Skip to content

Commit c643637

Browse files
committed
feat(html): add paragraph_attributes extension
- Introduce `Ext_paragraph_attributes` to control preservation of attributes on paragraph tags.
- Enable the extension by default for all HTML-based formats.
- The HTML reader now wraps attributed paragraphs in a Div when the `paragraph_attributes` or `native_divs` extension is enabled.
- The HTML writer unwraps the Div back to an attributed `p` tag.
- Add tests to verify attribute handling with and without the new extension.
1 parent 7ad3573 commit c643637

File tree

10 files changed

+57
-10
lines changed

10 files changed

+57
-10
lines changed

src/Text/Pandoc/Extensions.hs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ data Extension =
109109
| Ext_ntb -- ^ ConTeXt Natural Tables
110110
| Ext_old_dashes -- ^ -- = em, - before number = en
111111
| Ext_pandoc_title_block -- ^ Pandoc title block
112+
| Ext_paragraph_attributes -- ^ Preserve attributes on paragraphs instead of stripping them
112113
| Ext_pipe_tables -- ^ Pipe tables (as in PHP markdown extra)
113114
| Ext_raw_attribute -- ^ Allow explicit raw blocks/inlines
114115
| Ext_raw_html -- ^ Allow raw HTML
@@ -213,6 +214,7 @@ pandocExtensions = extensionsFromList
213214
[ Ext_footnotes
214215
, Ext_inline_notes
215216
, Ext_pandoc_title_block
217+
, Ext_paragraph_attributes
216218
, Ext_yaml_metadata_block
217219
, Ext_table_captions
218220
, Ext_implicit_figures
@@ -435,7 +437,8 @@ getDefaultExtensions "html" = extensionsFromList
435437
[Ext_auto_identifiers,
436438
Ext_native_divs,
437439
Ext_line_blocks,
438-
Ext_native_spans]
440+
Ext_native_spans,
441+
Ext_paragraph_attributes]
439442
getDefaultExtensions "html4" = getDefaultExtensions "html"
440443
getDefaultExtensions "html5" = getDefaultExtensions "html"
441444
getDefaultExtensions "epub" = extensionsFromList

src/Text/Pandoc/Readers/HTML.hs

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ import Text.Pandoc.Error
5959
import Text.Pandoc.Logging
6060
import Text.Pandoc.Options (
6161
Extension (Ext_epub_html_exts, Ext_empty_paragraphs, Ext_native_divs,
62-
Ext_native_spans, Ext_raw_html, Ext_line_blocks, Ext_raw_tex),
62+
Ext_paragraph_attributes, Ext_native_spans, Ext_raw_html,
63+
Ext_line_blocks, Ext_raw_tex),
6364
ReaderOptions (readerExtensions, readerStripComments),
6465
extensionEnabled)
6566
import Text.Pandoc.Parsing hiding ((<|>))
@@ -626,14 +627,12 @@ pPlain = do
626627
-- Helper function for pPara when significant attributes are present
627628
pParaWithWrapper :: PandocMonad m => Attr -> TagParser m Blocks
628629
pParaWithWrapper (ident, classes, kvs) = do
629-
guardEnabled Ext_native_divs -- Ensure native_divs is enabled for this behavior
630630
contents <- trimInlines <$> pInTags "p" inline
631631
(do guardDisabled Ext_empty_paragraphs
632632
guard (null contents)
633633
return mempty) <|> do
634634
let wrapperAttr = ("wrapper", "1")
635-
let finalKVs = wrapperAttr : kvs
636-
let finalAttrs = (ident, classes, finalKVs)
635+
let finalAttrs = (ident, classes, wrapperAttr : kvs)
637636
return $ B.divWith finalAttrs (B.para contents)
638637

639638
-- Helper function for pPara when no significant attributes are present
@@ -651,8 +650,10 @@ pPara = do
651650
let attr@(ident, classes, kvs) = toAttr attr'
652651
-- "Significant" attributes are any id, class, or key-value pair.
653652
let hasSignificantAttributes = not (T.null ident) || not (null classes) || not (null kvs)
654-
655-
if hasSignificantAttributes
653+
exts <- getOption readerExtensions
654+
if hasSignificantAttributes &&
655+
(extensionEnabled Ext_paragraph_attributes exts ||
656+
extensionEnabled Ext_native_divs exts)
656657
then pParaWithWrapper attr
657658
else pParaSimple
658659

src/Text/Pandoc/Writers/HTML.hs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -750,9 +750,11 @@ blockToHtmlInner opts (Para lst) = do
750750
blockToHtmlInner opts (LineBlock lns) = do
751751
htmlLines <- inlineListToHtml opts $ intercalate [LineBreak] lns
752752
return $ H.div ! A.class_ "line-block" $ htmlLines
753-
blockToHtmlInner opts (Div (ident, classes, kvs) [Para pans]) | Just "1" <- lookup "wrapper" kvs = do
753+
blockToHtmlInner opts (Div (ident, classes, kvs) [Para pans])
754+
| Just "1" <- lookup "wrapper" kvs = do
754755
-- This is a paragraph that was wrapped in a Div by the reader
755-
-- Unwrap it back to a <p> tag, transferring attributes from the Div
756+
-- because of the paragraph_attributes extension.
757+
-- Unwrap it back to a <p> tag, transferring attributes from the Div.
756758
let pKVs = filter (\(k,_) -> k /= "wrapper") kvs
757759
let pAttr = (ident, classes, pKVs)
758760
inner <- inlineListToHtml opts pans

test/Tests/Readers/HTML.hs

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import Text.Pandoc.Shared (isHeaderBlock)
2323
import Text.Pandoc.Arbitrary ()
2424
import Text.Pandoc.Builder
2525
import Text.Pandoc.Walk (walk)
26+
import Test.Tasty.HUnit (testCase, assertEqual)
2627

2728
html :: Text -> Pandoc
2829
html = purely $ readHtml def
@@ -55,6 +56,16 @@ roundTrip b = d'' == d'''
5556
purely (writeHtml5String def
5657
{ writerWrapText = WrapPreserve })
5758

59+
htmlHtmlTest :: TestName -> Extensions -> Extensions -> Text -> Text -> TestTree
60+
htmlHtmlTest name readerExts writerExts input expected =
61+
testCase name $ do
62+
let readerOpts = def { readerExtensions = readerExts }
63+
let writerOpts = def { writerExtensions = writerExts, writerWrapText = WrapAuto }
64+
actual <- runIOorExplode $ do
65+
pandoc <- readHtml readerOpts input
66+
writeHtml5String writerOpts pandoc
67+
assertEqual name (T.strip expected) (T.strip actual)
68+
5869
tests :: [TestTree]
5970
tests = [ testGroup "base tag"
6071
[ test html "simple" $
@@ -175,7 +186,21 @@ tests = [ testGroup "base tag"
175186
"<p id=\"baz\" align=\"invalid\">Invalid align with id.</p>" =?>
176187
doc (divWith ("baz", [], [("wrapper", "1"), ("align", "invalid")]) (para (text "Invalid align with id.")))
177188
]
178-
, askOption $ \(QuickCheckTests numtests) ->
189+
, testGroup "paragraph-attributes-roundtrip"
190+
[ htmlHtmlTest
191+
"strip attributes when extension is disabled"
192+
(readerExtensions def)
193+
(writerExtensions def)
194+
"<p id=\"foo\" class=\"bar\">Hello</p>"
195+
"<p>Hello</p>"
196+
, htmlHtmlTest
197+
"keep attributes when extension is enabled"
198+
(enableExtension Ext_native_divs $ enableExtension Ext_paragraph_attributes (readerExtensions def))
199+
(enableExtension Ext_paragraph_attributes (writerExtensions def))
200+
"<p id=\"foo\" class=\"bar\">Hello</p>"
201+
"<p id=\"foo\" class=\"bar\">Hello</p>"
202+
]
203+
, askOption $ \(QuickCheckTests numtests) ->
179204
testProperty "Round trip" $
180205
withMaxSuccess (if QuickCheckTests numtests == defaultValue
181206
then 25
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<p>Hallo Welt</p>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<p class="foo" id="bar">Hallo Welt</p>
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
description: 'Test paragraph attributes are removed by default (HTML to HTML)'
2+
input: paragraph_attributes_default.html
3+
output: paragraph_attributes_default.expected.html
4+
options:
5+
- --from=html
6+
- --to=html
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<p class="foo" id="bar">Hallo Welt</p>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<p class="foo" id="bar">Hallo Welt</p>
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
description: 'Test paragraph attributes are preserved with +paragraph_attributes (HTML to HTML)'
2+
input: paragraph_attributes_enabled.html
3+
output: paragraph_attributes_enabled.expected.html
4+
options:
5+
- --from=html+paragraph_attributes
6+
- --to=html+paragraph_attributes

0 commit comments

Comments
 (0)