Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions unstructured/cleaners/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,10 +314,12 @@ def remove_punctuation(s: str) -> str:


def remove_sentence_punctuation(s: str, exclude_punctuation: Optional[list]) -> str:
    """Translate ``s`` through the punctuation table, sparing excluded characters.

    The string is passed through ``str.translate`` using the module-level
    table ``tbl`` (defined elsewhere in this file; presumably it maps
    punctuation code points to ``None`` -- confirm against its definition).
    Entries for the characters in ``exclude_punctuation`` are dropped from a
    private copy of the table first, so those characters pass through
    unchanged.

    Args:
        s: The text to translate.
        exclude_punctuation: Single-character strings to leave untouched.
            ``None`` or an empty list means nothing is excluded.

    Returns:
        The translated string.

    Raises:
        TypeError: If an entry of ``exclude_punctuation`` is not a
            single character (raised by ``ord``).
    """
    if not exclude_punctuation:
        # Nothing to preserve: use the shared table as-is and skip the copy.
        return s.translate(tbl)

    # Work on a copy so the shared module-level table is never mutated.
    tbl_new = tbl.copy()
    for punct in exclude_punctuation:
        # pop() with a default tolerates repeated entries and characters that
        # are not in the table, where a bare ``del`` would raise KeyError.
        tbl_new.pop(ord(punct), None)
    return s.translate(tbl_new)

Expand Down
29 changes: 13 additions & 16 deletions unstructured/metrics/text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,26 +84,23 @@ def bag_of_words(text: str) -> Dict[str, int]:
words = clean_bullets(remove_sentence_punctuation(text.lower(), ["-", "'"])).split()

i = 0
while i < len(words):
if len(words[i]) > 1:
if words[i] in bow:
bow[words[i]] += 1
else:
bow[words[i]] = 1
n = len(words)
while i < n:
w = words[i]
lw = len(w)
if lw > 1:
bow[w] = bow.get(w, 0) + 1
i += 1
else:
j = i
incorrect_word = ""

while j < len(words) and len(words[j]) == 1:
incorrect_word += words[j]
# Find the extent of the run of single-character tokens.
j = i + 1
while j < n and len(words[j]) == 1:
j += 1

if len(incorrect_word) == 1 and words[i].isalnum():
if incorrect_word in bow:
bow[incorrect_word] += 1
else:
bow[incorrect_word] = 1
# If the run is exactly one single-character token and it's alphanumeric,
# count it. This preserves the original logic without constructing strings.
if j == i + 1 and w.isalnum():
bow[w] = bow.get(w, 0) + 1
i = j
return bow

Expand Down