|
2 | 2 | Copyright (c) 2024, 2025, Oracle and/or its affiliates. |
3 | 3 | Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl. |
4 | 4 | """ |
5 | | -# spell-checker:ignore langchain, docstore, docos, vectorstores, oraclevs |
| 5 | +# spell-checker:ignore langchain, docstore, docos, vectorstores, oraclevs, genai, hnsw |
6 | 6 |
|
7 | 7 | import json |
8 | 8 | import copy |
|
22 | 22 | from langchain_core.language_models.chat_models import BaseChatModel |
23 | 23 | from langchain.docstore.document import Document as LangchainDocument |
24 | 24 | from langchain.text_splitter import RecursiveCharacterTextSplitter |
25 | | -from langchain_text_splitters import HTMLSectionSplitter, CharacterTextSplitter |
| 25 | +from langchain_text_splitters import HTMLHeaderTextSplitter, CharacterTextSplitter |
26 | 26 |
|
27 | 27 | import server.utils.databases as databases |
28 | 28 |
|
@@ -130,20 +130,19 @@ def split_document( |
130 | 130 | ("h4", "Header 4"), |
131 | 131 | ("h5", "Header 5"), |
132 | 132 | ] |
133 | | - html_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on) |
| 133 | + html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) |
134 | 134 | ################################## |
135 | 135 | # Splitters - End |
136 | 136 | ################################## |
137 | 137 | match extension.lower(): |
138 | 138 | case "pdf": |
139 | 139 | doc_split = text_splitter.split_documents(document) |
140 | 140 | case "html": |
141 | | - try: |
142 | | - html_split = html_splitter.split_documents(document) |
143 | | - except Exception as ex: |
144 | | - logger.exception(ex) |
145 | | - html_split = document |
146 | | - doc_split = text_splitter.split_documents(html_split) |
| 141 | + tmp_meta = document[0].metadata |
| 142 | + doc_split = html_splitter.split_text(document[0].page_content) |
| 143 | + # Update metadata with source |
| 144 | + for doc in doc_split: |
| 145 | + doc.metadata.update(tmp_meta) |
147 | 146 | case "pdf" | "md" | "txt" | "csv": |
148 | 147 | doc_split = text_splitter.split_documents(document) |
149 | 148 | case _: |
@@ -180,7 +179,8 @@ def load_and_split_documents( |
180 | 179 | case "pdf": |
181 | 180 | loader = document_loaders.PyPDFLoader(file) |
182 | 181 | case "html": |
183 | | - loader = document_loaders.UnstructuredHTMLLoader(file) |
| 183 | + # Use TextLoader to preserve the raw HTML markup for the header splitter |
| 183 | + loader = document_loaders.TextLoader(file) |
184 | 184 | case "md": |
185 | 185 | loader = document_loaders.TextLoader(file) |
186 | 186 | case "csv": |
|
0 commit comments