@@ -119,26 +119,44 @@ def from_body(
119119 id = hash_bytes (f"{ document_id } -{ index } " .encode ()),
120120 document_id = document_id ,
121121 index = index ,
122- headings = headings ,
122+ headings = Chunk . truncate_headings ( headings , body ) ,
123123 body = body ,
124124 metadata_ = kwargs ,
125125 )
126126
127- def extract_headings (self ) -> str :
128- """Extract Markdown headings from the chunk, starting from the current Markdown headings."""
@staticmethod
def extract_heading_lines(doc: str, leading_only: bool = False) -> list[str]:  # noqa: FBT001,FBT002
    """Extract the leading or final state of the Markdown headings of a document.

    The result always has six slots, one per heading level: slot ``i`` holds the
    rendered heading line of level ``i + 1`` (e.g. ``"## Title"``), or ``""`` when
    no heading of that level is currently active.

    Args:
        doc: The Markdown text to scan.
        leading_only: When true, stop scanning at the first token outside a heading
            that carries non-whitespace content, so only the headings that open the
            document are reported. When false, the whole document is scanned and the
            heading state after its last heading is returned.

    Returns:
        A list of six strings describing the active heading at each level.
    """
    md = MarkdownIt()
    heading_lines = [""] * 6
    level = None
    for token in md.parse(doc):
        if token.type == "heading_open":
            # The tag is "h1".."h6"; anything outside that range is ignored.
            tag_level = int(token.tag[1])
            level = tag_level if 1 <= tag_level <= 6 else None  # noqa: PLR2004
        elif token.type == "heading_close":
            level = None
        elif level is not None:
            # Token between heading_open and heading_close: its content is the heading text.
            heading_content = token.content.strip().replace("\n", " ")
            heading_lines[level - 1] = ("#" * level) + " " + heading_content
            # A new heading invalidates every deeper heading. Replace exactly as many
            # slots as the slice covers so the list keeps its six entries.
            heading_lines[level:] = [""] * (len(heading_lines) - level)
        elif leading_only and token.content and not token.content.isspace():
            # First real content outside a heading: the leading headings are complete.
            break
    return heading_lines
145+
@staticmethod
def truncate_headings(headings: str, body: str) -> str:
    """Drop the contextual headings that the body's own leading headings supersede.

    The contextual headings are kept only for levels shallower than the shallowest
    heading found at the start of ``body``; if ``body`` opens with no heading, all
    contextual headings are kept. The surviving heading lines are joined with
    newlines.
    """
    context_lines = Chunk.extract_heading_lines(headings)
    body_lines = Chunk.extract_heading_lines(body, leading_only=True)
    # Slot index (level - 1) of the shallowest heading that opens the body, if any.
    first_body_slot = next((slot for slot, line in enumerate(body_lines) if line), None)
    if first_body_slot is not None:
        # Blank out that level and every deeper one in the contextual headings.
        for slot in range(first_body_slot, len(context_lines)):
            context_lines[slot] = ""
    return "\n".join(filter(None, context_lines))
156+
def extract_headings(self) -> str:
    """Return the Markdown headings in effect at the end of this chunk.

    The contextual headings and the body are parsed as one document (separated by a
    blank line), and the non-empty heading lines of the resulting state are joined
    with newlines.
    """
    combined = "\n\n".join((self.headings, self.body))
    return "\n".join(line for line in self.extract_heading_lines(combined) if line)
144162
0 commit comments