From 0c9a79f0838991ca3e15fd51e759030d3b61f661 Mon Sep 17 00:00:00 2001
From: zekiye <111131603+zekiyemeral@users.noreply.github.com>
Date: Wed, 3 Jun 2026 17:50:55 +0300
Subject: [PATCH 1/2] fix: escape pipe characters in CSV table cells

CSV cells are written into a pipe-delimited Markdown table, so a literal
pipe in the data has to be backslash-escaped or it is read as a column
delimiter. Backslashes that sit directly in front of a pipe are doubled
as well, so that the data's own backslash cannot absorb the escape that
is added for the pipe. Cells without a pipe are emitted unchanged.

---
 .../markitdown/converters/_csv_converter.py   | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_csv_converter.py b/packages/markitdown/src/markitdown/converters/_csv_converter.py
index 7e9631e1b..21183050a 100644
--- a/packages/markitdown/src/markitdown/converters/_csv_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py
@@ -1,17 +1,33 @@
 import csv
 import io
+import re
 from typing import BinaryIO, Any
+
 from charset_normalizer import from_bytes
+
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 
+
 ACCEPTED_MIME_TYPE_PREFIXES = [
     "text/csv",
     "application/csv",
 ]
+
 ACCEPTED_FILE_EXTENSIONS = [".csv"]
 
 
+def _escape_markdown_table_cell(cell: str) -> str:
+    """Escape pipe characters so the cell stays within one table column.
+
+    Every pipe gets a backslash prefix. Backslashes sitting directly in
+    front of a pipe are doubled first, so that parsers which pair up
+    backslashes cannot read the added escape as part of the data and
+    still treat the pipe as a column delimiter.
+    """
+    return re.sub(r"(\\*)\|", lambda m: m.group(1) * 2 + r"\|", cell)
+
+
 class CsvConverter(DocumentConverter):
     """
     Converts CSV files to Markdown tables.
@@ -28,11 +44,14 @@ def accepts(
     ) -> bool:
         mimetype = (stream_info.mimetype or "").lower()
         extension = (stream_info.extension or "").lower()
+
         if extension in ACCEPTED_FILE_EXTENSIONS:
             return True
+
         for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
             if mimetype.startswith(prefix):
                 return True
+
         return False
 
     def convert(
@@ -58,7 +77,8 @@ def convert(
         markdown_table = []
 
         # Add header row
-        markdown_table.append("| " + " | ".join(rows[0]) + " |")
+        header_row = [_escape_markdown_table_cell(cell) for cell in rows[0]]
+        markdown_table.append("| " + " | ".join(header_row) + " |")
 
         # Add separator row
         markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |")
@@ -68,10 +88,13 @@ def convert(
             # Make sure row has the same number of columns as header
             while len(row) < len(rows[0]):
                 row.append("")
+
             # Truncate if row has more columns than header
             row = row[: len(rows[0])]
-            markdown_table.append("| " + " | ".join(row) + " |")
+
+            escaped_row = [_escape_markdown_table_cell(cell) for cell in row]
+            markdown_table.append("| " + " | ".join(escaped_row) + " |")
 
         result = "\n".join(markdown_table)
 
         return DocumentConverterResult(markdown=result)

From 226668c05b2bba95b586403241bdff016bcc3896 Mon Sep 17 00:00:00 2001
From: zekiye <111131603+zekiyemeral@users.noreply.github.com>
Date: Wed, 3 Jun 2026 18:46:26 +0300
Subject: [PATCH 2/2] test: add regression coverage for CSV pipe escaping

Covers plain pipes as well as a pipe that is already preceded by a
backslash in the CSV data.

---
 packages/markitdown/tests/_test_vectors.py      | 19 +++++++++++++++++++
 .../tests/test_files/test_pipe_chars.csv        |  5 +++++
 2 files changed, 24 insertions(+)
 create mode 100644 packages/markitdown/tests/test_files/test_pipe_chars.csv

diff --git a/packages/markitdown/tests/_test_vectors.py b/packages/markitdown/tests/_test_vectors.py
index 74fa9bd0a..a30f84d35 100644
--- a/packages/markitdown/tests/_test_vectors.py
+++ b/packages/markitdown/tests/_test_vectors.py
@@ -151,6 +151,25 @@ class FileTestVector(object):
             "| 髙橋淳 | 35 | 名古屋 |",
         ],
         must_not_include=[],
+    ),
+    FileTestVector(
+        filename="test_pipe_chars.csv",
+        mimetype="text/csv",
+        charset="ascii",
+        url=None,
+        must_include=[
+            "| Name | Formula | Notes |",
+            "| --- | --- | --- |",
+            "| OR Gate | A \\| B | Pipe character should be escaped |",
+            "| AND Gate | A & B | Regular value |",
+            "| Regex | foo\\|bar | Pipe in regex-like value |",
+            "| Escaped | a\\\\\\|b | Backslash before pipe |",
+        ],
+        must_not_include=[
+            "| OR Gate | A | B | Pipe character should be escaped |",
+            "| Regex | foo|bar | Pipe in regex-like value |",
+            "| Escaped | a\\\\|b | Backslash before pipe |",
+        ],
     ),
     FileTestVector(
         filename="test.json",
diff --git a/packages/markitdown/tests/test_files/test_pipe_chars.csv b/packages/markitdown/tests/test_files/test_pipe_chars.csv
new file mode 100644
index 000000000..769679291
--- /dev/null
+++ b/packages/markitdown/tests/test_files/test_pipe_chars.csv
@@ -0,0 +1,5 @@
+Name,Formula,Notes
+OR Gate,A | B,Pipe character should be escaped
+AND Gate,A & B,Regular value
+Regex,foo|bar,Pipe in regex-like value
+Escaped,a\|b,Backslash before pipe
\ No newline at end of file