Reduce per-call work in remove_sentence_punctuation and bag_of_words.

unstructured/cleaners/core.py
  remove_sentence_punctuation: copy the translation table only when
  exclude_punctuation is truthy; otherwise translate with the shared
  table directly (str.translate does not modify the table).

unstructured/metrics/text_extraction.py
  bag_of_words: hoist len(words) out of the loop, count with dict.get
  instead of a membership test, and measure a run of single-character
  tokens by index instead of concatenating them into a string. Every
  token produced by str.split() is non-empty, so the token at position
  i in the else branch always has length 1 and the run can start at
  i + 1.

NOTE(review): whitespace on context lines was reconstructed from the
hunk line counts and Python block structure; confirm against the target
files before applying.

diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py
index b64c7bd19c..52b568502c 100644
--- a/unstructured/cleaners/core.py
+++ b/unstructured/cleaners/core.py
@@ -314,10 +314,12 @@ def remove_punctuation(s: str) -> str:
 
 
 def remove_sentence_punctuation(s: str, exclude_punctuation: Optional[list]) -> str:
-    tbl_new = tbl.copy()
+    tbl_new = tbl  # avoid copying when no exclusions are requested
     if exclude_punctuation:
+        tbl_new = tbl.copy()
+        o = ord
         for punct in exclude_punctuation:
-            del tbl_new[ord(punct)]
+            del tbl_new[o(punct)]
     s = s.translate(tbl_new)
     return s
 
diff --git a/unstructured/metrics/text_extraction.py b/unstructured/metrics/text_extraction.py
index 7153852305..b90a511ef5 100644
--- a/unstructured/metrics/text_extraction.py
+++ b/unstructured/metrics/text_extraction.py
@@ -84,26 +84,23 @@ def bag_of_words(text: str) -> Dict[str, int]:
     words = clean_bullets(remove_sentence_punctuation(text.lower(), ["-", "'"])).split()
 
     i = 0
-    while i < len(words):
-        if len(words[i]) > 1:
-            if words[i] in bow:
-                bow[words[i]] += 1
-            else:
-                bow[words[i]] = 1
+    n = len(words)
+    while i < n:
+        w = words[i]
+        lw = len(w)
+        if lw > 1:
+            bow[w] = bow.get(w, 0) + 1
             i += 1
         else:
-            j = i
-            incorrect_word = ""
-
-            while j < len(words) and len(words[j]) == 1:
-                incorrect_word += words[j]
+            # Find the extent of the run of single-character tokens.
+            j = i + 1
+            while j < n and len(words[j]) == 1:
                 j += 1
 
-            if len(incorrect_word) == 1 and words[i].isalnum():
-                if incorrect_word in bow:
-                    bow[incorrect_word] += 1
-                else:
-                    bow[incorrect_word] = 1
+            # If the run is exactly one single-character token and it's alphanumeric,
+            # count it. This preserves the original logic without constructing strings.
+            if j == i + 1 and w.isalnum():
+                bow[w] = bow.get(w, 0) + 1
             i = j
     return bow
 