Patch summary (free text before the first "diff --git" header).

unstructured/documents/elements.py
  * __init__ (hunk at line 266): call os.path.split() only when `filename` is truthy;
    otherwise use "", "" directly.
  * ElementMetadata.from_dict: no longer deep-copies the whole `meta_dict`. Only the
    "key_value_pairs" value is deep-copied, right before it is handed to
    _kvform_rehydrate_internal_elements().
    NOTE(review): values stored by the final `setattr` branch are now the caller's own
    objects rather than copies -- confirm no caller relies on the previous isolation.

unstructured/staging/base.py
  * elements_from_dicts: bind `elements.append`, TYPE_TO_TEXT_ELEMENT_MAP and `item.get`
    to local names and read "metadata" / "type" once per item.

Corrections relative to the earlier revision of this patch:
  * the `file_directory` comment is no longer added a second time above the existing copy;
  * the hunk that inserted an unused, function-local `Points: TypeAlias` into
    _kvform_rehydrate_internal_elements() has been dropped.
The post-image blob id on the elements.py `index` line was computed before these
corrections and therefore no longer matches the resulting file.

diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
index dbf4c4d3ef..014aab926d 100644
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@@ -266,8 +266,13 @@ def __init__(
 
         # -- accommodate pathlib.Path for filename --
         filename = str(filename) if isinstance(filename, pathlib.Path) else filename
-        # -- produces "", "" when filename arg is None --
-        directory_path, file_name = os.path.split(filename or "")
+
+        # -- avoid calling os.path.split when filename is falsy to reduce overhead --
+        if filename:
+            directory_path, file_name = os.path.split(filename)
+        else:
+            directory_path, file_name = "", ""
+
         # -- prefer `file_directory` arg if specified, otherwise split of file-path passed as
         # -- `filename` arg, or None if `filename` is the empty string.
         self.file_directory = file_directory or directory_path or None
@@ -334,8 +339,6 @@ def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata:
         """
         from unstructured.staging.base import elements_from_base64_gzipped_json
 
-        # -- avoid unexpected mutation by working on a copy of provided dict --
-        meta_dict = copy.deepcopy(meta_dict)
         self = ElementMetadata()
         for field_name, field_value in meta_dict.items():
             if field_name == "coordinates":
@@ -345,7 +348,11 @@ def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata:
             elif field_name == "orig_elements":
                 self.orig_elements = elements_from_base64_gzipped_json(field_value)
             elif field_name == "key_value_pairs":
-                self.key_value_pairs = _kvform_rehydrate_internal_elements(field_value)
+                # _kvform_rehydrate_internal_elements mutates the kv_pairs argument in-place,
+                # so deepcopy only this portion to avoid mutating the caller's structure.
+                self.key_value_pairs = _kvform_rehydrate_internal_elements(
+                    copy.deepcopy(field_value)
+                )
             else:
                 setattr(self, field_name, field_value)
 
diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py
index aab1b1647f..0b2e22f75a 100644
--- a/unstructured/staging/base.py
+++ b/unstructured/staging/base.py
@@ -58,19 +58,22 @@ def elements_from_dicts(element_dicts: Iterable[dict[str, Any]]) -> list[Element
     """Convert a list of element-dicts to a list of elements."""
     elements: list[Element] = []
 
+    elems_append = elements.append
+    map_ = TYPE_TO_TEXT_ELEMENT_MAP
+
     for item in element_dicts:
-        element_id: str = item.get("element_id", None)
-        metadata = (
-            ElementMetadata()
-            if item.get("metadata") is None
-            else ElementMetadata.from_dict(item["metadata"])
-        )
-
-        if item.get("type") in TYPE_TO_TEXT_ELEMENT_MAP:
-            ElementCls = TYPE_TO_TEXT_ELEMENT_MAP[item["type"]]
-            elements.append(ElementCls(text=item["text"], element_id=element_id, metadata=metadata))
-        elif item.get("type") == "CheckBox":
-            elements.append(
+        # Bind lookups locally to avoid repeated dict.get calls
+        get = item.get
+        element_id: str = get("element_id", None)
+        md = get("metadata")
+        metadata = ElementMetadata() if md is None else ElementMetadata.from_dict(md)
+
+        type_ = get("type")
+        if type_ in map_:
+            ElementCls = map_[type_]
+            elems_append(ElementCls(text=item["text"], element_id=element_id, metadata=metadata))
+        elif type_ == "CheckBox":
+            elems_append(
                 CheckBox(checked=item["checked"], element_id=element_id, metadata=metadata)
             )
 