Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions unstructured/documents/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,8 +266,15 @@ def __init__(

# -- accommodate pathlib.Path for filename --
filename = str(filename) if isinstance(filename, pathlib.Path) else filename
# -- produces "", "" when filename arg is None --
directory_path, file_name = os.path.split(filename or "")

# -- avoid calling os.path.split when filename is falsy to reduce overhead --
if filename:
directory_path, file_name = os.path.split(filename)
else:
directory_path, file_name = "", ""

# -- prefer `file_directory` arg if specified, otherwise split of file-path passed as
# -- `filename` arg, or None if `filename` is the empty string.
# -- prefer `file_directory` arg if specified, otherwise split of file-path passed as
# -- `filename` arg, or None if `filename` is the empty string.
self.file_directory = file_directory or directory_path or None
Expand Down Expand Up @@ -334,8 +341,6 @@ def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata:
"""
from unstructured.staging.base import elements_from_base64_gzipped_json

# -- avoid unexpected mutation by working on a copy of provided dict --
meta_dict = copy.deepcopy(meta_dict)
self = ElementMetadata()
for field_name, field_value in meta_dict.items():
if field_name == "coordinates":
Expand All @@ -345,7 +350,11 @@ def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata:
elif field_name == "orig_elements":
self.orig_elements = elements_from_base64_gzipped_json(field_value)
elif field_name == "key_value_pairs":
self.key_value_pairs = _kvform_rehydrate_internal_elements(field_value)
# _kvform_rehydrate_internal_elements mutates the kv_pairs argument in-place,
# so deepcopy only this portion to avoid mutating the caller's structure.
self.key_value_pairs = _kvform_rehydrate_internal_elements(
copy.deepcopy(field_value)
)
else:
setattr(self, field_name, field_value)

Expand Down Expand Up @@ -1036,6 +1045,8 @@ def _kvform_rehydrate_internal_elements(kv_pairs: list[dict[str, Any]]) -> list[
"""
from unstructured.staging.base import elements_from_dicts

Points: TypeAlias = "tuple[Point, ...]"

# safe to overwrite - deepcopy already happened
for kv_pair in kv_pairs:
if kv_pair["key"]["custom_element"] is not None:
Expand Down
27 changes: 15 additions & 12 deletions unstructured/staging/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,19 +58,22 @@ def elements_from_dicts(element_dicts: Iterable[dict[str, Any]]) -> list[Element
"""Convert a list of element-dicts to a list of elements."""
elements: list[Element] = []

elems_append = elements.append
map_ = TYPE_TO_TEXT_ELEMENT_MAP

for item in element_dicts:
element_id: str = item.get("element_id", None)
metadata = (
ElementMetadata()
if item.get("metadata") is None
else ElementMetadata.from_dict(item["metadata"])
)

if item.get("type") in TYPE_TO_TEXT_ELEMENT_MAP:
ElementCls = TYPE_TO_TEXT_ELEMENT_MAP[item["type"]]
elements.append(ElementCls(text=item["text"], element_id=element_id, metadata=metadata))
elif item.get("type") == "CheckBox":
elements.append(
# Bind lookups locally to avoid repeated dict.get calls
get = item.get
element_id: str = get("element_id", None)
md = get("metadata")
metadata = ElementMetadata() if md is None else ElementMetadata.from_dict(md)

type_ = get("type")
if type_ in map_:
ElementCls = map_[type_]
elems_append(ElementCls(text=item["text"], element_id=element_id, metadata=metadata))
elif type_ == "CheckBox":
elems_append(
CheckBox(checked=item["checked"], element_id=element_id, metadata=metadata)
)

Expand Down