Unstructured-IO · badGarnet · Apr 7, 2025 · Apr 3, 2025 · Apr 4, 2025 · Apr 7, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,9 @@
 ### Features
 
 ### Fixes
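+- **Make element ordering deterministic in `sort_page_element()`.** The sort key used the element id as a tie-breaker after the coordinates.
+As a result, two executions of the same code, on the same file, could produce different results: the order of the elements was effectively random.
+This made it impossible to write stable unit tests, for example, or to obtain reproducible results. The element id is no longer part of the sort key.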
 - **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`)
 
 ## 0.17.5

diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1603,3 +1603,24 @@ def test_partition_pdf_with_specified_ocr_agents(mocker):
 
     assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
     assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
+
+
+def test_reproductible_pdf_loader():
+    """Partitioning the same PDF repeatedly must return the same element text in order.
+
+    Guards against non-deterministic ordering of the elements between runs.
+    """
+    f = example_doc_path("pdf/layout-parser-paper.pdf")
+    partition_kwargs = {
+        "filename": f,
+        "strategy": PartitionStrategy.AUTO,
+        "infer_table_structure": False,
+    }
+    expected = [el.text for el in pdf.partition_pdf(**partition_kwargs)]
+    assert expected, f"partitioning {f=} returned no elements"
+    # Re-run several times: a single matching re-run does not prove stability
+    # when the ordering varies randomly between executions.
+    for _ in range(4):
+        actual = [el.text for el in pdf.partition_pdf(**partition_kwargs)]
+        # Whole-list equality also catches a differing element count, unlike zip().
+        assert actual == expected, f"load two time {f=} return differents results"
diff --git a/unstructured/partition/utils/sorting.py b/unstructured/partition/utils/sorting.py
@@ -179,7 +179,6 @@ def _coords_ok(strict_points: bool):
             key=lambda el: (
                 el.metadata.coordinates.points[0][1] if el.metadata.coordinates else float("inf"),
                 el.metadata.coordinates.points[0][0] if el.metadata.coordinates else float("inf"),
-                el.id,
             ),
         )
     else:
-Original file line number
+Diff line change
@@ Expand Up / @@ -179,7 +179,6 @@ def _coords_ok(strict_points: bool): @@
                 key=lambda el: (
                     el.metadata.coordinates.points[0][1] if el.metadata.coordinates else float("inf"),
                     el.metadata.coordinates.points[0][0] if el.metadata.coordinates else float("inf"),
-                    el.id,
                 ),
             )
         else:
@@ Expand Down @@