Skip to content

Commit a30892f

Browse files
Merge pull request #663 from Crozzers/fix-safemode-regressions
Fix safemode regressions (#660)
2 parents 3fe9325 + d59fb8e commit a30892f

10 files changed

+76
-40
lines changed

lib/markdown2.py

Lines changed: 47 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1358,6 +1358,14 @@ def _is_comment(token):
13581358
return
13591359
return re.match(r'(<!--)(.*)(-->)', token)
13601360

1361+
# protect raw code spans from processing, as they can often contain anything that looks like HTML and
1362+
# trips up the regex. These are encoded and processed later on anyway
1363+
code_hashes = {}
1364+
text = self._code_span_re.sub(
1365+
lambda m: self._hash_span(m.string[m.start(): m.end()], code_hashes),
1366+
text
1367+
)
1368+
13611369
tokens = []
13621370
split_tokens = self._sorta_html_tokenize_re.split(text)
13631371
index = 0
@@ -1386,7 +1394,12 @@ def _is_comment(token):
13861394
else:
13871395
tokens.append(self._encode_incomplete_tags(token))
13881396
index += 1
1389-
return ''.join(tokens)
1397+
1398+
text = ''.join(tokens)
1399+
# put markdown code spans back into the text for processing
1400+
for key, code in code_hashes.items():
1401+
text = text.replace(key, code)
1402+
return text
13901403

13911404
def _unhash_html_spans(self, text: str, spans=True, code=False) -> str:
13921405
'''
@@ -2219,7 +2232,7 @@ def _encode_amps_and_angles(self, text: str) -> str:
22192232
text = self._naked_gt_re.sub('&gt;', text)
22202233
return text
22212234

2222-
_incomplete_tags_re = re.compile(r"<(!--|/?\w+?(?!\w)\s*?.+?(?:[\s/]+?|$))")
2235+
_incomplete_tags_re = re.compile(r"\\*<(!--|/?\w+?(?!\w)\s*?.+?(?:[\s/]+?|$))")
22232236

22242237
def _encode_incomplete_tags(self, text: str) -> str:
22252238
if self.safe_mode not in ("replace", "escape"):
@@ -2228,24 +2241,15 @@ def _encode_incomplete_tags(self, text: str) -> str:
22282241
if self._is_auto_link(text):
22292242
return text # this is not an incomplete tag, this is a link in the form <http://x.y.z>
22302243

2231-
# protect code blocks. code blocks may have stuff like `C:\<folder>` in which is NOT a tag
2232-
# and will get encoded anyway in _encode_code
2233-
hashes = {}
2234-
for span in self._code_span_re.findall(text):
2235-
# the regex matches 2 groups: the syntax and the context. Reconstruct the entire match for easier processing
2236-
span = span[0] + span[1] + span[0]
2237-
hashed = _hash_text(span)
2238-
hashes[hashed] = span
2239-
text = text.replace(span, hashed)
2240-
22412244
def incomplete_tags_sub(match):
2242-
return match.group().replace('<', '&lt;')
2245+
text = match.group()
2246+
# ensure that we handle escaped incomplete tags properly by consuming and replacing the escapes
2247+
if not self._is_unescaped_re.match(text):
2248+
text = text.replace('\\<', '&lt;')
2249+
return text.replace('<', '&lt;')
22432250

22442251
text = self._incomplete_tags_re.sub(incomplete_tags_sub, text)
22452252

2246-
for hashed, original in hashes.items():
2247-
text = text.replace(hashed, original)
2248-
22492253
return text
22502254

22512255
def _encode_backslash_escapes(self, text: str) -> str:
@@ -2314,13 +2318,23 @@ def _outdent(self, text: str) -> str:
23142318
# Remove one level of line-leading tabs or spaces
23152319
return self._outdent_re.sub('', text)
23162320

2317-
def _hash_span(self, text: str) -> str:
2321+
def _hash_span(self, text: str, hash_table: Optional[dict] = None) -> str:
23182322
'''
23192323
Wrapper around `_hash_text` that also adds the hash to `self.html_spans`,
23202324
meaning it will be automatically unhashed during conversion.
2325+
2326+
Args:
2327+
text: the text to hash
2328+
hash_table: the dict to insert the hash into. If omitted, defaults to `self.html_spans`
2329+
2330+
Returns:
2331+
The hashed text
23212332
'''
23222333
key = _hash_text(text)
2323-
self.html_spans[key] = text
2334+
if hash_table is not None:
2335+
hash_table[key] = text
2336+
else:
2337+
self.html_spans[key] = text
23242338
return key
23252339

23262340
@staticmethod
@@ -2559,9 +2573,7 @@ def sub(self, match: re.Match) -> str:
25592573

25602574
def sub_hash(self, match: re.Match) -> str:
25612575
substr = match.string[match.start(): match.end()]
2562-
key = _hash_text(substr)
2563-
self.hash_table[key] = substr
2564-
return key
2576+
return self.md._hash_span(substr, self.hash_table)
25652577

25662578
def test(self, text):
25672579
if self.md.order < Stage.ITALIC_AND_BOLD:
@@ -3124,7 +3136,7 @@ def unhash_code(codeblock):
31243136
**formatter_opts)
31253137

31263138
# add back the indent to all lines
3127-
return "\n%s\n" % self.md._uniform_indent(colored, leading_indent, True)
3139+
return self.md._uniform_indent(colored, leading_indent, True)
31283140

31293141
def tags(self, lexer_name: str) -> tuple[str, str]:
31303142
'''
@@ -3149,12 +3161,20 @@ def sub(self, match: re.Match) -> str:
31493161
codeblock = match.group(3)
31503162
codeblock = codeblock[:-1] # drop one trailing newline
31513163

3164+
# figure out what newlines were already surrounding the code block and preserve them in the output
3165+
leading_newlines = match.string[match.start(): match.regs[1][0]]
3166+
trailing_newlines = re.search(r'\n*$', match.group()).group()
3167+
31523168
# Use pygments only if not using the highlightjs-lang extra
31533169
if lexer_name and "highlightjs-lang" not in self.md.extras:
31543170
lexer = self.md._get_pygments_lexer(lexer_name)
31553171
if lexer:
3156-
leading_indent = ' '*(len(match.group(1)) - len(match.group(1).lstrip()))
3157-
return self._code_block_with_lexer_sub(codeblock, leading_indent, lexer)
3172+
leading_indent = ' ' * (len(match.group(1)) - len(match.group(1).lstrip()))
3173+
return (
3174+
leading_newlines
3175+
+ self._code_block_with_lexer_sub(codeblock, leading_indent, lexer)
3176+
+ trailing_newlines
3177+
)
31583178

31593179
# Fenced code blocks need to be outdented before encoding, and then reapplied
31603180
leading_indent = ' ' * (len(match.group(1)) - len(match.group(1).lstrip()))
@@ -3166,18 +3186,12 @@ def sub(self, match: re.Match) -> str:
31663186

31673187
tags = self.tags(lexer_name)
31683188

3169-
# when not in safe-mode, we convert fenced code blocks before Stage.HASH_HTML, which means the text
3170-
# ends up as `\n\nmd5-...\n\n`, thanks to the hashing stages adding in some newlines
3171-
# in safe mode, we run fenced code blocks AFTER the hashing, so we don't end up with that same
3172-
# `\n\n` wrap. We can correct that here
3173-
surrounding_newlines = '\n\n' if self.md.safe_mode else '\n'
3174-
31753189
return (
3176-
f'{surrounding_newlines}'
3190+
f'{leading_newlines}'
31773191
f'{leading_indent}{tags[0]}'
31783192
f'{codeblock}'
31793193
f'\n{leading_indent}{tags[1]}'
3180-
f'{surrounding_newlines}'
3194+
f'{trailing_newlines}'
31813195
)
31823196

31833197
def run(self, text):
@@ -3296,8 +3310,7 @@ def run(self, text):
32963310
.replace('*', self.md._escape_table['*'])
32973311
.replace('_', self.md._escape_table['_']))
32983312
link = '<a href="{}">{}</a>'.format(escaped_href, text[start:end])
3299-
hash = _hash_text(link)
3300-
link_from_hash[hash] = link
3313+
hash = self.md._hash_span(link, link_from_hash)
33013314
text = text[:start] + hash + text[end:]
33023315
for hash, link in list(link_from_hash.items()):
33033316
text = text.replace(hash, link)
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
<p>&lt;abc&gt;
2-
&lt;abc></p>
2+
&lt;abc>
3+
&lt;why?</p>
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
\<abc\>
2-
\<abc>
2+
\<abc>
3+
\<why?
Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1-
<p>This appears to be an incomplete tag, but it's not because it's in a code span.</p>
1+
<p>This appears to be incomplete tags, but they're not because they're in code spans.</p>
22

33
<p>Path: <code>C:\&lt;folder 1&gt;</code></p>
4+
5+
<p>Path: <code>C:\&lt;folder&gt;</code></p>
Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1-
This appears to be an incomplete tag, but it's not because it's in a code span.
1+
This appears to be incomplete tags, but they're not because they're in code spans.
22

3-
Path: `C:\<folder 1>`
3+
Path: `C:\<folder 1>`
4+
5+
Path: `C:\<folder>`

test/tm-cases/safe_mode_fenced_code_joined_to_lists.text

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@
33

44
```
55
// Some code
6-
```
6+
```
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<ul>
2+
<li>Item 1</li>
3+
<li>Item 2</li>
4+
</ul>
5+
6+
<div class="codehilite">
7+
<pre><span></span><code><span class="c1">// Some code</span>
8+
</code></pre>
9+
</div>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{'safe_mode': 'escape', 'extras': ['fenced-code-blocks']}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
fenced-code-blocks pygments
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
* Item 1
2+
* Item 2
3+
4+
```cpp
5+
// Some code
6+
```

0 commit comments

Comments
 (0)