From 133d19e059a3dc279547d4cea7f0bd04bfcb7737 Mon Sep 17 00:00:00 2001
From: Farruh Sheripov <66794734+Sheripov@users.noreply.github.com>
Date: Fri, 7 Feb 2025 15:55:30 +0100
Subject: [PATCH] Update html_table.py

---
Notes (this section is ignored when the patch is applied):
  * `text` previously joined every text node of the table with single spaces
    and then collapsed runs of whitespace.
  * After this change it emits one line per row from `iter_rows()`, with that
    row's `iter_cell_texts()` values joined by ", ".
  * NOTE(review): the leading whitespace of the hunk lines was reconstructed
    (4-space method indent inside the class) -- confirm against the target
    file before applying.

 unstructured/common/html_table.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/unstructured/common/html_table.py b/unstructured/common/html_table.py
index a441e5a57b..a215817991 100644
--- a/unstructured/common/html_table.py
+++ b/unstructured/common/html_table.py
@@ -108,9 +108,8 @@ def iter_rows(self) -> Iterator[HtmlRow]:
     @lazyproperty
     def text(self) -> str:
         """The clean, concatenated, text for this table."""
-        table_text = " ".join(self._table.itertext())
-        # -- blank cells will introduce extra whitespace, so normalize after accumulating --
-        return " ".join(table_text.split())
+        # improve readability of the text
+        return "\n".join([", ".join([j for j in i.iter_cell_texts()]) for i in self.iter_rows()])
 
 
 class HtmlRow: