@@ -1358,6 +1358,14 @@ def _is_comment(token):
13581358 return
13591359 return re .match (r'(<!--)(.*)(-->)' , token )
13601360
1361+ # protect raw code spans from processing, as they can often contain anything that looks like HTML and
1362+ # trips up the regex. These are encoded and processed later on anyway
1363+ code_hashes = {}
1364+ text = self ._code_span_re .sub (
1365+ lambda m : self ._hash_span (m .string [m .start (): m .end ()], code_hashes ),
1366+ text
1367+ )
1368+
13611369 tokens = []
13621370 split_tokens = self ._sorta_html_tokenize_re .split (text )
13631371 index = 0
@@ -1386,7 +1394,12 @@ def _is_comment(token):
13861394 else :
13871395 tokens .append (self ._encode_incomplete_tags (token ))
13881396 index += 1
1389- return '' .join (tokens )
1397+
1398+ text = '' .join (tokens )
1399+ # put markdown code spans back into the text for processing
1400+ for key , code in code_hashes .items ():
1401+ text = text .replace (key , code )
1402+ return text
13901403
13911404 def _unhash_html_spans (self , text : str , spans = True , code = False ) -> str :
13921405 '''
@@ -2219,7 +2232,7 @@ def _encode_amps_and_angles(self, text: str) -> str:
22192232 text = self ._naked_gt_re .sub ('>' , text )
22202233 return text
22212234
2222- _incomplete_tags_re = re .compile (r"<(!--|/?\w+?(?!\w)\s*?.+?(?:[\s/]+?|$))" )
2235+ _incomplete_tags_re = re .compile (r"\\* <(!--|/?\w+?(?!\w)\s*?.+?(?:[\s/]+?|$))" )
22232236
22242237 def _encode_incomplete_tags (self , text : str ) -> str :
22252238 if self .safe_mode not in ("replace" , "escape" ):
@@ -2228,24 +2241,15 @@ def _encode_incomplete_tags(self, text: str) -> str:
22282241 if self ._is_auto_link (text ):
22292242 return text # this is not an incomplete tag, this is a link in the form <http://x.y.z>
22302243
2231- # protect code blocks. code blocks may have stuff like `C:\<folder>` in which is NOT a tag
2232- # and will get encoded anyway in _encode_code
2233- hashes = {}
2234- for span in self ._code_span_re .findall (text ):
2235- # the regex matches 2 groups: the syntax and the context. Reconstruct the entire match for easier processing
2236- span = span [0 ] + span [1 ] + span [0 ]
2237- hashed = _hash_text (span )
2238- hashes [hashed ] = span
2239- text = text .replace (span , hashed )
2240-
def incomplete_tags_sub(match):
    '''
    Substitution callback for `_incomplete_tags_re`: HTML-encode every `<`
    in the matched text so the incomplete tag is rendered literally.

    Args:
        match: a match produced by `self._incomplete_tags_re`; the matched
            text may begin with backslashes preceding the `<`

    Returns:
        The matched text with each `<` replaced by `&lt;`
    '''
    matched = match.group()
    # ensure that we handle escaped incomplete tags properly by consuming and replacing the escapes
    # NOTE(review): `_is_unescaped_re` is defined outside this view; presumably it
    # checks whether the leading backslashes leave the `<` unescaped - confirm.
    if not self._is_unescaped_re.match(matched):
        matched = matched.replace('\\<', '&lt;')
    return matched.replace('<', '&lt;')
22432250
22442251 text = self ._incomplete_tags_re .sub (incomplete_tags_sub , text )
22452252
2246- for hashed , original in hashes .items ():
2247- text = text .replace (hashed , original )
2248-
22492253 return text
22502254
22512255 def _encode_backslash_escapes (self , text : str ) -> str :
@@ -2314,13 +2318,23 @@ def _outdent(self, text: str) -> str:
23142318 # Remove one level of line-leading tabs or spaces
23152319 return self ._outdent_re .sub ('' , text )
23162320
def _hash_span(self, text: str, hash_table: Optional[dict] = None) -> str:
    '''
    Wrapper around `_hash_text` that also records the `hash -> text` mapping.

    By default the mapping is stored in `self.html_spans`, meaning it will be
    automatically unhashed during conversion. When a `hash_table` is given the
    mapping is stored there instead, and the caller is responsible for putting
    the original text back.

    Args:
        text: the text to hash
        hash_table: the dict to insert the hash into. If omitted will default to `self.html_spans`

    Returns:
        The hashed text
    '''
    key = _hash_text(text)
    # compare against None rather than truthiness: an empty dict is a valid destination
    destination = self.html_spans if hash_table is None else hash_table
    destination[key] = text
    return key
23252339
23262340 @staticmethod
@@ -2559,9 +2573,7 @@ def sub(self, match: re.Match) -> str:
25592573
def sub_hash(self, match: re.Match) -> str:
    '''
    Replace a regex match with a hash of its full text.

    The matched text is hashed via `self.md._hash_span`, which records the
    `hash -> text` mapping in `self.hash_table`.

    Args:
        match: the regex match whose entire matched text should be hashed

    Returns:
        The hash key returned by `_hash_span`
    '''
    matched_text = match.group()
    return self.md._hash_span(matched_text, self.hash_table)
25652577
25662578 def test (self , text ):
25672579 if self .md .order < Stage .ITALIC_AND_BOLD :
@@ -3124,7 +3136,7 @@ def unhash_code(codeblock):
31243136 ** formatter_opts )
31253137
31263138 # add back the indent to all lines
3127- return " \n %s \n " % self .md ._uniform_indent (colored , leading_indent , True )
3139+ return self .md ._uniform_indent (colored , leading_indent , True )
31283140
31293141 def tags (self , lexer_name : str ) -> tuple [str , str ]:
31303142 '''
@@ -3149,12 +3161,20 @@ def sub(self, match: re.Match) -> str:
31493161 codeblock = match .group (3 )
31503162 codeblock = codeblock [:- 1 ] # drop one trailing newline
31513163
3164+ # figure out what newlines were already surrounding the code block and preserve them in the output
3165+ leading_newlines = match .string [match .start (): match .regs [1 ][0 ]]
3166+ trailing_newlines = re .search (r'\n*$' , match .group ()).group ()
3167+
31523168 # Use pygments only if not using the highlightjs-lang extra
31533169 if lexer_name and "highlightjs-lang" not in self .md .extras :
31543170 lexer = self .md ._get_pygments_lexer (lexer_name )
31553171 if lexer :
3156- leading_indent = ' ' * (len (match .group (1 )) - len (match .group (1 ).lstrip ()))
3157- return self ._code_block_with_lexer_sub (codeblock , leading_indent , lexer )
3172+ leading_indent = ' ' * (len (match .group (1 )) - len (match .group (1 ).lstrip ()))
3173+ return (
3174+ leading_newlines
3175+ + self ._code_block_with_lexer_sub (codeblock , leading_indent , lexer )
3176+ + trailing_newlines
3177+ )
31583178
31593179 # Fenced code blocks need to be outdented before encoding, and then reapplied
31603180 leading_indent = ' ' * (len (match .group (1 )) - len (match .group (1 ).lstrip ()))
@@ -3166,18 +3186,12 @@ def sub(self, match: re.Match) -> str:
31663186
31673187 tags = self .tags (lexer_name )
31683188
3169- # when not in safe-mode, we convert fenced code blocks before Stage.HASH_HTML, which means the text
3170- # ends up as `\n\nmd5-...\n\n`, thanks to the hashing stages adding in some newlines
3171- # in safe mode, we run fenced code blocks AFTER the hashing, so we don't end up with that same
3172- # `\n\n` wrap. We can correct that here
3173- surrounding_newlines = '\n \n ' if self .md .safe_mode else '\n '
3174-
31753189 return (
3176- f'{ surrounding_newlines } '
3190+ f'{ leading_newlines } '
31773191 f'{ leading_indent } { tags [0 ]} '
31783192 f'{ codeblock } '
31793193 f'\n { leading_indent } { tags [1 ]} '
3180- f'{ surrounding_newlines } '
3194+ f'{ trailing_newlines } '
31813195 )
31823196
31833197 def run (self , text ):
@@ -3296,8 +3310,7 @@ def run(self, text):
32963310 .replace ('*' , self .md ._escape_table ['*' ])
32973311 .replace ('_' , self .md ._escape_table ['_' ]))
32983312 link = '<a href="{}">{}</a>' .format (escaped_href , text [start :end ])
3299- hash = _hash_text (link )
3300- link_from_hash [hash ] = link
3313+ hash = self .md ._hash_span (link , link_from_hash )
33013314 text = text [:start ] + hash + text [end :]
33023315 for hash , link in list (link_from_hash .items ()):
33033316 text = text .replace (hash , link )
0 commit comments