diff --git a/CHANGELOG.md b/CHANGELOG.md
index baa69aae9f..4da58bbc9e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### Features
 
 ### Fixes
+- **Make the element order returned by `sort_page_element()` reproducible.** The sort used the element id as a sort key, so two executions of the same code on the same file produced different results: the order of the elements was random. This made it impossible to write stable unit tests, for example, or to obtain reproducible results. The element id is no longer part of the sort key.
 - **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`)
 
 ## 0.17.5
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
index 7a0c8ff29c..70eec35fd7 100644
--- a/test_unstructured/partition/pdf_image/test_pdf.py
+++ b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1603,3 +1603,21 @@ def test_partition_pdf_with_specified_ocr_agents(mocker):
 
     assert spy.call_args_list[0][1] == {"language": "eng", "ocr_agent_module": OCR_AGENT_TESSERACT}
     assert spy.call_args_list[1][1] == {"language": "en", "ocr_agent_module": OCR_AGENT_PADDLE}
+
+
+def test_reproductible_pdf_loader():
+    """Repeated partitioning of one PDF must yield the same element texts in the same order."""
+    f = example_doc_path("pdf/layout-parser-paper.pdf")
+
+    def _partition_texts():
+        elements = pdf.partition_pdf(
+            filename=f,
+            strategy=PartitionStrategy.AUTO,
+            infer_table_structure=False,
+        )
+        return [e.text for e in elements]
+
+    reference = _partition_texts()
+    for _ in range(4):
+        # Compare whole lists rather than zip(): zip() would hide a missing or extra element.
+        assert _partition_texts() == reference, f"load two time {f=} return differents results"
diff --git a/unstructured/partition/utils/sorting.py 
b/unstructured/partition/utils/sorting.py index 8cdc885dd1..59d550958b 100644 --- a/unstructured/partition/utils/sorting.py +++ b/unstructured/partition/utils/sorting.py @@ -179,7 +179,6 @@ def _coords_ok(strict_points: bool): key=lambda el: ( el.metadata.coordinates.points[0][1] if el.metadata.coordinates else float("inf"), el.metadata.coordinates.points[0][0] if el.metadata.coordinates else float("inf"), - el.id, ), ) else: