Merge pull request #663 from Crozzers/fix-safemode-regressions

nicholasserra · web-flow · commit a30892f649ab · 2025-11-17T12:04:10.000-05:00
Fix safemode regressions (#660)
diff --git a/lib/markdown2.py b/lib/markdown2.py
@@ -1358,6 +1358,14 @@ def _is_comment(token):
                 return
             return re.match(r'(<!--)(.*)(-->)', token)
 
+        # protect raw code spans from processing, as they can often contain anything that looks like HTML and
+        # trips up the regex. These are encoded and processed later on anyway
+        code_hashes = {}
+        text = self._code_span_re.sub(
+            lambda m: self._hash_span(m.string[m.start(): m.end()], code_hashes),
+            text
+        )
+
         tokens = []
         split_tokens = self._sorta_html_tokenize_re.split(text)
         index = 0
@@ -1386,7 +1394,12 @@ def _is_comment(token):
             else:
                 tokens.append(self._encode_incomplete_tags(token))
             index += 1
-        return ''.join(tokens)
+
+        text = ''.join(tokens)
+        # put markdown code spans back into the text for processing
+        for key, code in code_hashes.items():
+            text = text.replace(key, code)
+        return text
 
     def _unhash_html_spans(self, text: str, spans=True, code=False) -> str:
         '''
@@ -2219,7 +2232,7 @@ def _encode_amps_and_angles(self, text: str) -> str:
         text = self._naked_gt_re.sub('&gt;', text)
         return text
 
-    _incomplete_tags_re = re.compile(r"<(!--|/?\w+?(?!\w)\s*?.+?(?:[\s/]+?|$))")
+    _incomplete_tags_re = re.compile(r"\\*<(!--|/?\w+?(?!\w)\s*?.+?(?:[\s/]+?|$))")
 
     def _encode_incomplete_tags(self, text: str) -> str:
         if self.safe_mode not in ("replace", "escape"):
@@ -2228,24 +2241,15 @@ def _encode_incomplete_tags(self, text: str) -> str:
         if self._is_auto_link(text):
             return text  # this is not an incomplete tag, this is a link in the form <http://x.y.z>
 
-        # protect code blocks. code blocks may have stuff like `C:\<folder>` in which is NOT a tag
-        # and will get encoded anyway in _encode_code
-        hashes = {}
-        for span in self._code_span_re.findall(text):
-            # the regex matches 2 groups: the syntax and the context. Reconstruct the entire match for easier processing
-            span = span[0] + span[1] + span[0]
-            hashed = _hash_text(span)
-            hashes[hashed] = span
-            text = text.replace(span, hashed)
-
         def incomplete_tags_sub(match):
-            return match.group().replace('<', '&lt;')
+            text = match.group()
+            # ensure that we handle escaped incomplete tags properly by consuming and replacing the escapes
+            if not self._is_unescaped_re.match(text):
+                text = text.replace('\\<', '&lt;')
+            return text.replace('<', '&lt;')
 
         text = self._incomplete_tags_re.sub(incomplete_tags_sub, text)
 
-        for hashed, original in hashes.items():
-            text = text.replace(hashed, original)
-
         return text
 
     def _encode_backslash_escapes(self, text: str) -> str:
@@ -2314,13 +2318,23 @@ def _outdent(self, text: str) -> str:
         # Remove one level of line-leading tabs or spaces
         return self._outdent_re.sub('', text)
 
-    def _hash_span(self, text: str) -> str:
+    def _hash_span(self, text: str, hash_table: Optional[dict] = None) -> str:
         '''
         Wrapper around `_hash_text` that also adds the hash to `self.hash_spans`,
         meaning it will be automatically unhashed during conversion.
+
+        Args:
+            text: the text to hash
+            hash_table: the dict to insert the hash into. If omitted will default to `self.html_spans`
+
+        Returns:
+            The hashed text
         '''
         key = _hash_text(text)
-        self.html_spans[key] = text
+        if hash_table is not None:
+            hash_table[key] = text
+        else:
+            self.html_spans[key] = text
         return key
 
     @staticmethod
@@ -2559,9 +2573,7 @@ def sub(self, match: re.Match) -> str:
 
     def sub_hash(self, match: re.Match) -> str:
         substr = match.string[match.start(): match.end()]
-        key = _hash_text(substr)
-        self.hash_table[key] = substr
-        return key
+        return self.md._hash_span(substr, self.hash_table)
 
     def test(self, text):
         if self.md.order < Stage.ITALIC_AND_BOLD:
@@ -3124,7 +3136,7 @@ def unhash_code(codeblock):
                                                **formatter_opts)
 
         # add back the indent to all lines
-        return "\n%s\n" % self.md._uniform_indent(colored, leading_indent, True)
+        return self.md._uniform_indent(colored, leading_indent, True)
 
     def tags(self, lexer_name: str) -> tuple[str, str]:
         '''
@@ -3149,12 +3161,20 @@ def sub(self, match: re.Match) -> str:
         codeblock = match.group(3)
         codeblock = codeblock[:-1]  # drop one trailing newline
 
+        # figure out what newlines were already surrounding the code block and preserve them in the output
+        leading_newlines = match.string[match.start(): match.regs[1][0]]
+        trailing_newlines = re.search(r'\n*$', match.group()).group()
+
         # Use pygments only if not using the highlightjs-lang extra
         if lexer_name and "highlightjs-lang" not in self.md.extras:
             lexer = self.md._get_pygments_lexer(lexer_name)
             if lexer:
-                leading_indent = ' '*(len(match.group(1)) - len(match.group(1).lstrip()))
-                return self._code_block_with_lexer_sub(codeblock, leading_indent, lexer)
+                leading_indent = ' ' * (len(match.group(1)) - len(match.group(1).lstrip()))
+                return (
+                    leading_newlines
+                    + self._code_block_with_lexer_sub(codeblock, leading_indent, lexer)
+                    + trailing_newlines
+                )
 
         # Fenced code blocks need to be outdented before encoding, and then reapplied
         leading_indent = ' ' * (len(match.group(1)) - len(match.group(1).lstrip()))
@@ -3166,18 +3186,12 @@ def sub(self, match: re.Match) -> str:
 
         tags = self.tags(lexer_name)
 
-        # when not in safe-mode, we convert fenced code blocks before Stage.HASH_HTML, which means the text
-        # ends up as `\n\nmd5-...\n\n`, thanks to the hashing stages adding in some newlines
-        # in safe mode, we run fenced code blocks AFTER the hashing, so we don't end up with that same
-        # `\n\n` wrap. We can correct that here
-        surrounding_newlines = '\n\n' if self.md.safe_mode else '\n'
-
         return (
-            f'{surrounding_newlines}'
+            f'{leading_newlines}'
             f'{leading_indent}{tags[0]}'
             f'{codeblock}'
             f'\n{leading_indent}{tags[1]}'
-            f'{surrounding_newlines}'
+            f'{trailing_newlines}'
         )
 
     def run(self, text):
@@ -3296,8 +3310,7 @@ def run(self, text):
                         .replace('*', self.md._escape_table['*'])
                         .replace('_', self.md._escape_table['_']))
                 link = '<a href="{}">{}</a>'.format(escaped_href, text[start:end])
-                hash = _hash_text(link)
-                link_from_hash[hash] = link
+                hash = self.md._hash_span(link, link_from_hash)
                 text = text[:start] + hash + text[end:]
         for hash, link in list(link_from_hash.items()):
             text = text.replace(hash, link)
diff --git a/test/tm-cases/escaped_html_in_safe_mode.html b/test/tm-cases/escaped_html_in_safe_mode.html
@@ -1,2 +1,3 @@
 <p>&lt;abc&gt;
-&lt;abc></p>
+&lt;abc>
+&lt;why?</p>
diff --git a/test/tm-cases/escaped_html_in_safe_mode.text b/test/tm-cases/escaped_html_in_safe_mode.text
@@ -1,2 +1,3 @@
 \<abc\>
-\<abc>
+\<abc>
+\<why?
diff --git a/test/tm-cases/incomplete_tags_in_code_spans.html b/test/tm-cases/incomplete_tags_in_code_spans.html
@@ -1,3 +1,5 @@
-<p>This appears to be an incomplete tag, but it's not because it's in a code span.</p>
+<p>This appears to be incomplete tags, but they're not because they're in code spans.</p>
 
 <p>Path: <code>C:\&lt;folder 1&gt;</code></p>
+
+<p>Path: <code>C:\&lt;folder&gt;</code></p>
diff --git a/test/tm-cases/incomplete_tags_in_code_spans.text b/test/tm-cases/incomplete_tags_in_code_spans.text
@@ -1,3 +1,5 @@
-This appears to be an incomplete tag, but it's not because it's in a code span.
+This appears to be incomplete tags, but they're not because they're in code spans.
 
-Path: `C:\<folder 1>`
+Path: `C:\<folder 1>`
+
+Path: `C:\<folder>`
diff --git a/test/tm-cases/safe_mode_fenced_code_joined_to_lists.text b/test/tm-cases/safe_mode_fenced_code_joined_to_lists.text
@@ -3,4 +3,4 @@
 
 ```
 // Some code
-```
+```
diff --git a/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.html b/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.html
@@ -0,0 +1,9 @@
+<ul>
+<li>Item 1</li>
+<li>Item 2</li>
+</ul>
+
+<div class="codehilite">
+<pre><span></span><code><span class="c1">// Some code</span>
+</code></pre>
+</div>
diff --git a/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.opts b/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.opts
@@ -0,0 +1 @@
+{'safe_mode': 'escape', 'extras': ['fenced-code-blocks']}
diff --git a/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.tags b/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.tags
@@ -0,0 +1 @@
+fenced-code-blocks pygments
diff --git a/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.text b/test/tm-cases/safe_mode_fenced_code_with_lexer_joined_to_lists.text
@@ -0,0 +1,6 @@
+* Item 1
+* Item 2
+
+```cpp
+// Some code
+```

-Original file line number
+Diff line change
@@ @@ -1,2 +1,3 @@ @@
 \<abc\>
 -\<abc>
 +\<abc>
 +\<why?
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+{'safe_mode': 'escape', 'extras': ['fenced-code-blocks']}`