Allow attr_list quoted values to contain curly braces

oprypin · web-flow · commit 3d8afc6f89e1 · 2024-03-12T11:54:39.000-04:00
How it worked before:

  * Extract the content without allowing any `}` in it, and require that it ends with `}` - for block elements it's anchored to the end of the line, otherwise not.
  * Parse the content in more detail. No edge cases with `}` can arise. If parsing is interrupted by some unrecognized token, discard the rest of the string.

How it works now:

  * Extract the content *and allow* `}` in it, and require that it ends with `}` - for block elements it's anchored to the end of the line, otherwise not.
  * Parse the content in more detail. Allow `}` only within the quoted parts, otherwise interrupt parsing like for any other unrecognized token.
    If parsing is interrupted, there is remaining unrecognized text. Ideally, we would perhaps bail out entirely at this point (and not recognize it as an attr_list), but to preserve historic behavior, any extra text before `}` is just discarded.
    If there is an extra `}` in the remaining text:
      * For block elements: that must mean that the attr_list syntax did not in fact terminate at the end of the line but earlier. So, bail out and do not register any attributes and do not change the original text.
      * For inline elements: that must mean that we just overmatched a bit, but that's OK, we just assign attrs as normal and put the extra text back into the string. As mentioned, any extra text *before* `}` is just discarded.
diff --git a/.spell-dict b/.spell-dict
@@ -146,6 +146,7 @@ Treeprocessor
 Treeprocessors
 tuple
 tuples
+unparsable
 unclosed
 unescape
 unescaping
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -34,8 +34,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * Include `scripts/*.py` in the generated source tarballs (#1430).
 * Ensure lines after heading in loose list are properly detabbed (#1443).
 * Give smarty tree processor higher priority than toc (#1440).
-* Permit carrots (`^`) and square brackets (`]`) but explicitly exclude
+* Permit carets (`^`) and square brackets (`]`) but explicitly exclude
   backslashes (`\`) from abbreviations (#1444).
+* In attribute lists (`attr_list`, `fenced_code`), quoted attribute values are
+  now allowed to contain curly braces (`}`) (#1414).
 
 ## [3.5.2] -- 2024-01-10
 
diff --git a/markdown/extensions/attr_list.py b/markdown/extensions/attr_list.py
@@ -57,17 +57,30 @@ def _handle_word(s, t):
 
 
 _scanner = re.Scanner([
-    (r'[^ =]+=".*?"', _handle_double_quote),
-    (r"[^ =]+='.*?'", _handle_single_quote),
-    (r'[^ =]+=[^ =]+', _handle_key_value),
-    (r'[^ =]+', _handle_word),
+    (r'[^ =}]+=".*?"', _handle_double_quote),
+    (r"[^ =}]+='.*?'", _handle_single_quote),
+    (r'[^ =}]+=[^ =}]+', _handle_key_value),
+    (r'[^ =}]+', _handle_word),
     (r' ', None)
 ])
 
 
-def get_attrs(str: str) -> list[tuple[str, str]]:
-    """ Parse attribute list and return a list of attribute tuples. """
-    return _scanner.scan(str)[0]
+def get_attrs_and_remainder(attrs_string: str) -> tuple[list[tuple[str, str]], str]:
+    """ Parse attribute list and return a list of attribute tuples.
+
+    Additionally, return any text that remained after a curly brace. In typical cases, its presence
+    should mean that the input does not match the intended attribute list syntax.
+    """
+    attrs, remainder = _scanner.scan(attrs_string)
+    # To keep historic behavior, discard all unparsable text prior to '}'.
+    index = remainder.find('}')
+    remainder = remainder[index:] if index != -1 else ''
+    return attrs, remainder
+
+
+def get_attrs(str: str) -> list[tuple[str, str]]:  # pragma: no cover
+    """ Soft-deprecated. Prefer `get_attrs_and_remainder`. """
+    return get_attrs_and_remainder(str)[0]
 
 
 def isheader(elem: Element) -> bool:
@@ -76,7 +89,7 @@ def isheader(elem: Element) -> bool:
 
 class AttrListTreeprocessor(Treeprocessor):
 
-    BASE_RE = r'\{\:?[ ]*([^\}\n ][^\}\n]*)[ ]*\}'
+    BASE_RE = r'\{\:?[ ]*([^\}\n ][^\n]*)[ ]*\}'
     HEADER_RE = re.compile(r'[ ]+{}[ ]*$'.format(BASE_RE))
     BLOCK_RE = re.compile(r'\n[ ]*{}[ ]*$'.format(BASE_RE))
     INLINE_RE = re.compile(r'^{}'.format(BASE_RE))
@@ -106,49 +119,58 @@ def run(self, doc: Element) -> None:
                         # use tail of last child. no `ul` or `ol`.
                         m = RE.search(elem[-1].tail)
                         if m:
-                            self.assign_attrs(elem, m.group(1))
-                            elem[-1].tail = elem[-1].tail[:m.start()]
+                            if not self.assign_attrs(elem, m.group(1), strict=True):
+                                elem[-1].tail = elem[-1].tail[:m.start()]
                     elif pos is not None and pos > 0 and elem[pos-1].tail:
                         # use tail of last child before `ul` or `ol`
                         m = RE.search(elem[pos-1].tail)
                         if m:
-                            self.assign_attrs(elem, m.group(1))
-                            elem[pos-1].tail = elem[pos-1].tail[:m.start()]
+                            if not self.assign_attrs(elem, m.group(1), strict=True):
+                                elem[pos-1].tail = elem[pos-1].tail[:m.start()]
                     elif elem.text:
                         # use text. `ul` is first child.
                         m = RE.search(elem.text)
                         if m:
-                            self.assign_attrs(elem, m.group(1))
-                            elem.text = elem.text[:m.start()]
+                            if not self.assign_attrs(elem, m.group(1), strict=True):
+                                elem.text = elem.text[:m.start()]
                 elif len(elem) and elem[-1].tail:
                     # has children. Get from tail of last child
                     m = RE.search(elem[-1].tail)
                     if m:
-                        self.assign_attrs(elem, m.group(1))
-                        elem[-1].tail = elem[-1].tail[:m.start()]
-                        if isheader(elem):
-                            # clean up trailing #s
-                            elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
+                        if not self.assign_attrs(elem, m.group(1), strict=True):
+                            elem[-1].tail = elem[-1].tail[:m.start()]
+                            if isheader(elem):
+                                # clean up trailing #s
+                                elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
                 elif elem.text:
                     # no children. Get from text.
                     m = RE.search(elem.text)
                     if m:
-                        self.assign_attrs(elem, m.group(1))
-                        elem.text = elem.text[:m.start()]
-                        if isheader(elem):
-                            # clean up trailing #s
-                            elem.text = elem.text.rstrip('#').rstrip()
+                        if not self.assign_attrs(elem, m.group(1), strict=True):
+                            elem.text = elem.text[:m.start()]
+                            if isheader(elem):
+                                # clean up trailing #s
+                                elem.text = elem.text.rstrip('#').rstrip()
             else:
                 # inline: check for `attrs` at start of tail
                 if elem.tail:
                     m = self.INLINE_RE.match(elem.tail)
                     if m:
-                        self.assign_attrs(elem, m.group(1))
-                        elem.tail = elem.tail[m.end():]
+                        remainder = self.assign_attrs(elem, m.group(1))
+                        elem.tail = elem.tail[m.end():] + remainder
+
+    def assign_attrs(self, elem: Element, attrs_string: str, *, strict: bool = False) -> str:
+        """ Assign `attrs` to element.
+
+        If the `attrs_string` has an extra closing curly brace, the remaining text is returned.
+
+        The `strict` argument controls whether to still assign `attrs` if there is a remaining `}`.
+        """
+        attrs, remainder = get_attrs_and_remainder(attrs_string)
+        if strict and remainder:
+            return remainder
 
-    def assign_attrs(self, elem: Element, attrs: str) -> None:
-        """ Assign `attrs` to element. """
-        for k, v in get_attrs(attrs):
+        for k, v in attrs:
             if k == '.':
                 # add to class
                 cls = elem.get('class')
@@ -159,11 +181,13 @@ def assign_attrs(self, elem: Element, attrs: str) -> None:
             else:
                 # assign attribute `k` with `v`
                 elem.set(self.sanitize_name(k), v)
+        # The text that we initially over-matched will be put back.
+        return remainder
 
     def sanitize_name(self, name: str) -> str:
         """
-        Sanitize name as 'an XML Name, minus the ":"'.
-        See https://www.w3.org/TR/REC-xml-names/#NT-NCName
+        Sanitize name as 'an XML Name, minus the `:`.'
+        See <https://www.w3.org/TR/REC-xml-names/#NT-NCName>.
         """
         return self.NAME_RE.sub('_', name)
 
diff --git a/markdown/extensions/fenced_code.py b/markdown/extensions/fenced_code.py
@@ -25,7 +25,7 @@
 from . import Extension
 from ..preprocessors import Preprocessor
 from .codehilite import CodeHilite, CodeHiliteExtension, parse_hl_lines
-from .attr_list import get_attrs, AttrListExtension
+from .attr_list import get_attrs_and_remainder, AttrListExtension
 from ..util import parseBoolValue
 from ..serializers import _escape_attrib_html
 import re
@@ -56,7 +56,7 @@ class FencedBlockPreprocessor(Preprocessor):
     FENCED_BLOCK_RE = re.compile(
         dedent(r'''
             (?P<fence>^(?:~{3,}|`{3,}))[ ]*                          # opening fence
-            ((\{(?P<attrs>[^\}\n]*)\})|                              # (optional {attrs} or
+            ((\{(?P<attrs>[^\n]*)\})|                                # (optional {attrs} or
             (\.?(?P<lang>[\w#.+-]*)[ ]*)?                            # optional (.)lang
             (hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot)[ ]*)?) # optional hl_lines)
             \n                                                       # newline (end of opening fence)
@@ -94,12 +94,17 @@ def run(self, lines: list[str]) -> list[str]:
             self.checked_for_deps = True
 
         text = "\n".join(lines)
+        index = 0
         while 1:
-            m = self.FENCED_BLOCK_RE.search(text)
+            m = self.FENCED_BLOCK_RE.search(text, index)
             if m:
                 lang, id, classes, config = None, '', [], {}
                 if m.group('attrs'):
-                    id, classes, config = self.handle_attrs(get_attrs(m.group('attrs')))
+                    attrs, remainder = get_attrs_and_remainder(m.group('attrs'))
+                    if remainder:  # Does not have correctly matching curly braces, so the syntax is invalid.
+                        index = m.end('attrs')  # Explicitly skip over this, to prevent an infinite loop.
+                        continue
+                    id, classes, config = self.handle_attrs(attrs)
                     if len(classes):
                         lang = classes.pop(0)
                 else:
@@ -151,6 +156,8 @@ def run(self, lines: list[str]) -> list[str]:
 
                 placeholder = self.md.htmlStash.store(code)
                 text = f'{text[:m.start()]}\n{placeholder}\n{text[m.end():]}'
+                # Continue from after the replaced text in the next iteration.
+                index = m.start() + 1 + len(placeholder)
             else:
                 break
         return text.split("\n")
diff --git a/tests/test_syntax/extensions/test_attr_list.py b/tests/test_syntax/extensions/test_attr_list.py
@@ -23,16 +23,53 @@
 
 
 class TestAttrList(TestCase):
-
     maxDiff = None
+    default_kwargs = {'extensions': ['attr_list']}
 
     # TODO: Move the rest of the `attr_list` tests here.
 
-    def test_empty_list(self):
+    def test_empty_attr_list(self):
         self.assertMarkdownRenders(
             '*foo*{ }',
-            '<p><em>foo</em>{ }</p>',
-            extensions=['attr_list']
+            '<p><em>foo</em>{ }</p>'
+        )
+
+    def test_curly_after_inline(self):
+        self.assertMarkdownRenders(
+            '*inline*{.a} } *text*{.a }}',
+            '<p><em class="a">inline</em> } <em class="a">text</em>}</p>'
+        )
+
+    def test_extra_eq_gets_ignored_inside_curly_inline(self):
+        # Undesired behavior but kept for historic compatibility.
+        self.assertMarkdownRenders(
+            '*inline*{data-test="x" =a} *text*',
+            '<p><em data-test="x">inline</em> <em>text</em></p>'
+        )
+
+    def test_curly_after_block(self):
+        self.assertMarkdownRenders(
+            '# Heading {.a} }',
+            '<h1>Heading {.a} }</h1>'
+        )
+
+    def test_curly_in_single_quote(self):
+        self.assertMarkdownRenders(
+            "# Heading {data-test='{}'}",
+            '<h1 data-test="{}">Heading</h1>'
+        )
+
+    def test_curly_in_double_quote(self):
+        self.assertMarkdownRenders(
+            '# Heading {data-test="{}"}',
+            '<h1 data-test="{}">Heading</h1>'
+        )
+
+    def test_unclosed_quote_ignored(self):
+        # Undesired behavior but kept for historic compatibility.
+        self.assertMarkdownRenders(
+            '# Heading {foo="bar}',
+            '<h1 foo="&quot;bar">Heading</h1>'
         )
 
     def test_table_td(self):
diff --git a/tests/test_syntax/extensions/test_fenced_code.py b/tests/test_syntax/extensions/test_fenced_code.py
@@ -394,6 +394,48 @@ def testFencedCodeEscapedAttrs(self):
             extensions=['fenced_code', 'attr_list']
         )
 
+    def testFencedCodeCurlyInAttrs(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                ``` { data-test="{}" }
+                # Some python code
+                ```
+                '''
+            ),
+            self.dedent(
+                '''
+                <pre><code data-test="{}"># Some python code
+                </code></pre>
+                '''
+            ),
+            extensions=['fenced_code', 'attr_list']
+        )
+
+    def testFencedCodeMismatchedCurlyInAttrs(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                '''
+                ``` { data-test="{}" } }
+                # Some python code
+                ```
+                ```
+                test
+                ```
+                '''
+            ),
+            self.dedent(
+                '''
+                <p>``` { data-test="{}" } }</p>
+                <h1>Some python code</h1>
+                <pre><code></code></pre>
+                <p>test
+                ```</p>
+                '''
+            ),
+            extensions=['fenced_code', 'attr_list']
+        )
+
 
 class TestFencedCodeWithCodehilite(TestCase):