Skip to content

Commit 3d8afc6

Browse files
authored
Allow attr_list quoted values to contain curly braces
How it worked before:

* Extract the content without allowing any `}` in it, and require that it ends with `}`. For block elements the match is anchored to the end of the line; for inline elements it is not.
* Parse the content in more detail. No edge cases with `}` can arise. If parsing is interrupted by some unrecognized token, discard the rest of the string.

How it works now:

* Extract the content *and allow* `}` in it, and require that it ends with `}`. For block elements the match is anchored to the end of the line; for inline elements it is not.
* Parse the content in more detail. Allow `}` only within the quoted parts; anywhere else, interrupt parsing as for any other unrecognized token.

If parsing is interrupted, there is remaining unrecognized text. Ideally we would perhaps bail out entirely at this point (and not recognize the text as an attr_list), but to preserve historic behavior, any extra text before `}` is just discarded.

If there is an extra `}` in the remaining text:

* For block elements: that must mean that the attr_list syntax did not in fact terminate at the end of the line, but earlier. So bail out: do not register any attributes and do not change the original text.
* For inline elements: that must mean that we just over-matched a bit, which is OK. We assign attrs as normal and put the extra text back into the string. As mentioned, any extra text *before* `}` is just discarded.
1 parent 9edba85 commit 3d8afc6

File tree

6 files changed

+153
-40
lines changed

6 files changed

+153
-40
lines changed

.spell-dict

+1
Original file line number | Diff line number | Diff line change
@@ -146,6 +146,7 @@ Treeprocessor
146146
Treeprocessors
147147
tuple
148148
tuples
149+
unparsable
149150
unclosed
150151
unescape
151152
unescaping

docs/changelog.md

+3-1
Original file line number | Diff line number | Diff line change
@@ -34,8 +34,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
3434
* Include `scripts/*.py` in the generated source tarballs (#1430).
3535
* Ensure lines after heading in loose list are properly detabbed (#1443).
3636
* Give smarty tree processor higher priority than toc (#1440).
37-
* Permit carrots (`^`) and square brackets (`]`) but explicitly exclude
37+
* Permit carets (`^`) and square brackets (`]`) but explicitly exclude
3838
backslashes (`\`) from abbreviations (#1444).
39+
* In attribute lists (`attr_list`, `fenced_code`), quoted attribute values are
40+
now allowed to contain curly braces (`}`) (#1414).
3941

4042
## [3.5.2] -- 2024-01-10
4143

markdown/extensions/attr_list.py

+55-31
Original file line number | Diff line number | Diff line change
@@ -57,17 +57,30 @@ def _handle_word(s, t):
5757

5858

5959
_scanner = re.Scanner([
60-
(r'[^ =]+=".*?"', _handle_double_quote),
61-
(r"[^ =]+='.*?'", _handle_single_quote),
62-
(r'[^ =]+=[^ =]+', _handle_key_value),
63-
(r'[^ =]+', _handle_word),
60+
(r'[^ =}]+=".*?"', _handle_double_quote),
61+
(r"[^ =}]+='.*?'", _handle_single_quote),
62+
(r'[^ =}]+=[^ =}]+', _handle_key_value),
63+
(r'[^ =}]+', _handle_word),
6464
(r' ', None)
6565
])
6666

6767

68-
def get_attrs(str: str) -> list[tuple[str, str]]:
69-
""" Parse attribute list and return a list of attribute tuples. """
70-
return _scanner.scan(str)[0]
68+
def get_attrs_and_remainder(attrs_string: str) -> tuple[list[tuple[str, str]], str]:
69+
""" Parse attribute list and return a list of attribute tuples.
70+
71+
Additionally, return any text that remained after a curly brace. In typical cases, its presence
72+
should mean that the input does not match the intended attribute list syntax.
73+
"""
74+
attrs, remainder = _scanner.scan(attrs_string)
75+
# To keep historic behavior, discard all unparsable text prior to '}'.
76+
index = remainder.find('}')
77+
remainder = remainder[index:] if index != -1 else ''
78+
return attrs, remainder
79+
80+
81+
def get_attrs(str: str) -> list[tuple[str, str]]: # pragma: no cover
82+
""" Soft-deprecated. Prefer `get_attrs_and_remainder`. """
83+
return get_attrs_and_remainder(str)[0]
7184

7285

7386
def isheader(elem: Element) -> bool:
@@ -76,7 +89,7 @@ def isheader(elem: Element) -> bool:
7689

7790
class AttrListTreeprocessor(Treeprocessor):
7891

79-
BASE_RE = r'\{\:?[ ]*([^\}\n ][^\}\n]*)[ ]*\}'
92+
BASE_RE = r'\{\:?[ ]*([^\}\n ][^\n]*)[ ]*\}'
8093
HEADER_RE = re.compile(r'[ ]+{}[ ]*$'.format(BASE_RE))
8194
BLOCK_RE = re.compile(r'\n[ ]*{}[ ]*$'.format(BASE_RE))
8295
INLINE_RE = re.compile(r'^{}'.format(BASE_RE))
@@ -106,49 +119,58 @@ def run(self, doc: Element) -> None:
106119
# use tail of last child. no `ul` or `ol`.
107120
m = RE.search(elem[-1].tail)
108121
if m:
109-
self.assign_attrs(elem, m.group(1))
110-
elem[-1].tail = elem[-1].tail[:m.start()]
122+
if not self.assign_attrs(elem, m.group(1), strict=True):
123+
elem[-1].tail = elem[-1].tail[:m.start()]
111124
elif pos is not None and pos > 0 and elem[pos-1].tail:
112125
# use tail of last child before `ul` or `ol`
113126
m = RE.search(elem[pos-1].tail)
114127
if m:
115-
self.assign_attrs(elem, m.group(1))
116-
elem[pos-1].tail = elem[pos-1].tail[:m.start()]
128+
if not self.assign_attrs(elem, m.group(1), strict=True):
129+
elem[pos-1].tail = elem[pos-1].tail[:m.start()]
117130
elif elem.text:
118131
# use text. `ul` is first child.
119132
m = RE.search(elem.text)
120133
if m:
121-
self.assign_attrs(elem, m.group(1))
122-
elem.text = elem.text[:m.start()]
134+
if not self.assign_attrs(elem, m.group(1), strict=True):
135+
elem.text = elem.text[:m.start()]
123136
elif len(elem) and elem[-1].tail:
124137
# has children. Get from tail of last child
125138
m = RE.search(elem[-1].tail)
126139
if m:
127-
self.assign_attrs(elem, m.group(1))
128-
elem[-1].tail = elem[-1].tail[:m.start()]
129-
if isheader(elem):
130-
# clean up trailing #s
131-
elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
140+
if not self.assign_attrs(elem, m.group(1), strict=True):
141+
elem[-1].tail = elem[-1].tail[:m.start()]
142+
if isheader(elem):
143+
# clean up trailing #s
144+
elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
132145
elif elem.text:
133146
# no children. Get from text.
134147
m = RE.search(elem.text)
135148
if m:
136-
self.assign_attrs(elem, m.group(1))
137-
elem.text = elem.text[:m.start()]
138-
if isheader(elem):
139-
# clean up trailing #s
140-
elem.text = elem.text.rstrip('#').rstrip()
149+
if not self.assign_attrs(elem, m.group(1), strict=True):
150+
elem.text = elem.text[:m.start()]
151+
if isheader(elem):
152+
# clean up trailing #s
153+
elem.text = elem.text.rstrip('#').rstrip()
141154
else:
142155
# inline: check for `attrs` at start of tail
143156
if elem.tail:
144157
m = self.INLINE_RE.match(elem.tail)
145158
if m:
146-
self.assign_attrs(elem, m.group(1))
147-
elem.tail = elem.tail[m.end():]
159+
remainder = self.assign_attrs(elem, m.group(1))
160+
elem.tail = elem.tail[m.end():] + remainder
161+
162+
def assign_attrs(self, elem: Element, attrs_string: str, *, strict: bool = False) -> str:
163+
""" Assign `attrs` to element.
164+
165+
If the `attrs_string` has an extra closing curly brace, the remaining text is returned.
166+
167+
The `strict` argument controls whether to still assign `attrs` if there is a remaining `}`.
168+
"""
169+
attrs, remainder = get_attrs_and_remainder(attrs_string)
170+
if strict and remainder:
171+
return remainder
148172

149-
def assign_attrs(self, elem: Element, attrs: str) -> None:
150-
""" Assign `attrs` to element. """
151-
for k, v in get_attrs(attrs):
173+
for k, v in attrs:
152174
if k == '.':
153175
# add to class
154176
cls = elem.get('class')
@@ -159,11 +181,13 @@ def assign_attrs(self, elem: Element, attrs: str) -> None:
159181
else:
160182
# assign attribute `k` with `v`
161183
elem.set(self.sanitize_name(k), v)
184+
# The text that we initially over-matched will be put back.
185+
return remainder
162186

163187
def sanitize_name(self, name: str) -> str:
164188
"""
165-
Sanitize name as 'an XML Name, minus the ":"'.
166-
See https://www.w3.org/TR/REC-xml-names/#NT-NCName
189+
Sanitize name as 'an XML Name, minus the `:`.'
190+
See <https://www.w3.org/TR/REC-xml-names/#NT-NCName>.
167191
"""
168192
return self.NAME_RE.sub('_', name)
169193

markdown/extensions/fenced_code.py

+11-4
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@
2525
from . import Extension
2626
from ..preprocessors import Preprocessor
2727
from .codehilite import CodeHilite, CodeHiliteExtension, parse_hl_lines
28-
from .attr_list import get_attrs, AttrListExtension
28+
from .attr_list import get_attrs_and_remainder, AttrListExtension
2929
from ..util import parseBoolValue
3030
from ..serializers import _escape_attrib_html
3131
import re
@@ -56,7 +56,7 @@ class FencedBlockPreprocessor(Preprocessor):
5656
FENCED_BLOCK_RE = re.compile(
5757
dedent(r'''
5858
(?P<fence>^(?:~{3,}|`{3,}))[ ]* # opening fence
59-
((\{(?P<attrs>[^\}\n]*)\})| # (optional {attrs} or
59+
((\{(?P<attrs>[^\n]*)\})| # (optional {attrs} or
6060
(\.?(?P<lang>[\w#.+-]*)[ ]*)? # optional (.)lang
6161
(hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot)[ ]*)?) # optional hl_lines)
6262
\n # newline (end of opening fence)
@@ -94,12 +94,17 @@ def run(self, lines: list[str]) -> list[str]:
9494
self.checked_for_deps = True
9595

9696
text = "\n".join(lines)
97+
index = 0
9798
while 1:
98-
m = self.FENCED_BLOCK_RE.search(text)
99+
m = self.FENCED_BLOCK_RE.search(text, index)
99100
if m:
100101
lang, id, classes, config = None, '', [], {}
101102
if m.group('attrs'):
102-
id, classes, config = self.handle_attrs(get_attrs(m.group('attrs')))
103+
attrs, remainder = get_attrs_and_remainder(m.group('attrs'))
104+
if remainder: # Does not have correctly matching curly braces, so the syntax is invalid.
105+
index = m.end('attrs') # Explicitly skip over this, to prevent an infinite loop.
106+
continue
107+
id, classes, config = self.handle_attrs(attrs)
103108
if len(classes):
104109
lang = classes.pop(0)
105110
else:
@@ -151,6 +156,8 @@ def run(self, lines: list[str]) -> list[str]:
151156

152157
placeholder = self.md.htmlStash.store(code)
153158
text = f'{text[:m.start()]}\n{placeholder}\n{text[m.end():]}'
159+
# Continue from after the replaced text in the next iteration.
160+
index = m.start() + 1 + len(placeholder)
154161
else:
155162
break
156163
return text.split("\n")

tests/test_syntax/extensions/test_attr_list.py

+41-4
Original file line number | Diff line number | Diff line change
@@ -23,16 +23,53 @@
2323

2424

2525
class TestAttrList(TestCase):
26-
2726
maxDiff = None
27+
default_kwargs = {'extensions': ['attr_list']}
2828

2929
# TODO: Move the rest of the `attr_list` tests here.
3030

31-
def test_empty_list(self):
31+
def test_empty_attr_list(self):
3232
self.assertMarkdownRenders(
3333
'*foo*{ }',
34-
'<p><em>foo</em>{ }</p>',
35-
extensions=['attr_list']
34+
'<p><em>foo</em>{ }</p>'
35+
)
36+
37+
def test_curly_after_inline(self):
38+
self.assertMarkdownRenders(
39+
'*inline*{.a} } *text*{.a }}',
40+
'<p><em class="a">inline</em> } <em class="a">text</em>}</p>'
41+
)
42+
43+
def test_extra_eq_gets_ignored_inside_curly_inline(self):
44+
# Undesired behavior but kept for historic compatibility.
45+
self.assertMarkdownRenders(
46+
'*inline*{data-test="x" =a} *text*',
47+
'<p><em data-test="x">inline</em> <em>text</em></p>'
48+
)
49+
50+
def test_curly_after_block(self):
51+
self.assertMarkdownRenders(
52+
'# Heading {.a} }',
53+
'<h1>Heading {.a} }</h1>'
54+
)
55+
56+
def test_curly_in_single_quote(self):
57+
self.assertMarkdownRenders(
58+
"# Heading {data-test='{}'}",
59+
'<h1 data-test="{}">Heading</h1>'
60+
)
61+
62+
def test_curly_in_double_quote(self):
63+
self.assertMarkdownRenders(
64+
'# Heading {data-test="{}"}',
65+
'<h1 data-test="{}">Heading</h1>'
66+
)
67+
68+
def test_unclosed_quote_ignored(self):
69+
# Undesired behavior but kept for historic compatibility.
70+
self.assertMarkdownRenders(
71+
'# Heading {foo="bar}',
72+
'<h1 foo="&quot;bar">Heading</h1>'
3673
)
3774

3875
def test_table_td(self):

tests/test_syntax/extensions/test_fenced_code.py

+42
Original file line number | Diff line number | Diff line change
@@ -394,6 +394,48 @@ def testFencedCodeEscapedAttrs(self):
394394
extensions=['fenced_code', 'attr_list']
395395
)
396396

397+
def testFencedCodeCurlyInAttrs(self):
398+
self.assertMarkdownRenders(
399+
self.dedent(
400+
'''
401+
``` { data-test="{}" }
402+
# Some python code
403+
```
404+
'''
405+
),
406+
self.dedent(
407+
'''
408+
<pre><code data-test="{}"># Some python code
409+
</code></pre>
410+
'''
411+
),
412+
extensions=['fenced_code', 'attr_list']
413+
)
414+
415+
def testFencedCodeMismatchedCurlyInAttrs(self):
416+
self.assertMarkdownRenders(
417+
self.dedent(
418+
'''
419+
``` { data-test="{}" } }
420+
# Some python code
421+
```
422+
```
423+
test
424+
```
425+
'''
426+
),
427+
self.dedent(
428+
'''
429+
<p>``` { data-test="{}" } }</p>
430+
<h1>Some python code</h1>
431+
<pre><code></code></pre>
432+
<p>test
433+
```</p>
434+
'''
435+
),
436+
extensions=['fenced_code', 'attr_list']
437+
)
438+
397439

398440
class TestFencedCodeWithCodehilite(TestCase):
399441

0 commit comments

Comments
 (0)