Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions packages/markitdown/src/markitdown/converters/_csv_converter.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,25 @@
import csv
import io
from typing import BinaryIO, Any

from charset_normalizer import from_bytes

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo


# MIME type prefixes treated as CSV. The converter's accepts() check compares
# them with str.startswith against the lower-cased MIME type of the stream.
ACCEPTED_MIME_TYPE_PREFIXES = [
"text/csv",
"application/csv",
]

# File extensions treated as CSV. The converter's accepts() check tests the
# lower-cased stream extension for membership in this list.
ACCEPTED_FILE_EXTENSIONS = [".csv"]


def _escape_markdown_table_cell(cell: str) -> str:
return cell.replace("|", r"\|")
Comment thread
zekiyemeral marked this conversation as resolved.


class CsvConverter(DocumentConverter):
"""
Converts CSV files to Markdown tables.
Expand All @@ -28,11 +36,14 @@ def accepts(
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()

if extension in ACCEPTED_FILE_EXTENSIONS:
return True

for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True

return False

def convert(
Expand All @@ -58,7 +69,8 @@ def convert(
markdown_table = []

# Add header row
markdown_table.append("| " + " | ".join(rows[0]) + " |")
header_row = [_escape_markdown_table_cell(cell) for cell in rows[0]]
markdown_table.append("| " + " | ".join(header_row) + " |")

# Add separator row
markdown_table.append("| " + " | ".join(["---"] * len(rows[0])) + " |")
Expand All @@ -68,10 +80,14 @@ def convert(
# Make sure row has the same number of columns as header
while len(row) < len(rows[0]):
row.append("")

# Truncate if row has more columns than header
row = row[: len(rows[0])]
markdown_table.append("| " + " | ".join(row) + " |")

escaped_row = [_escape_markdown_table_cell(cell) for cell in row]
markdown_table.append("| " + " | ".join(escaped_row) + " |")

result = "\n".join(markdown_table)

return DocumentConverterResult(markdown=result)

Comment on lines 92 to +93
17 changes: 17 additions & 0 deletions packages/markitdown/tests/_test_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,23 @@ class FileTestVector(object):
"| 髙橋淳 | 35 | 名古屋 |",
],
must_not_include=[],
),
FileTestVector(
filename="test_pipe_chars.csv",
mimetype="text/csv",
charset="ascii",
url=None,
must_include=[
"| Name | Formula | Notes |",
"| --- | --- | --- |",
"| OR Gate | A \\| B | Pipe character should be escaped |",
"| AND Gate | A & B | Regular value |",
"| Regex | foo\\|bar | Pipe in regex-like value |",
],
must_not_include=[
"| OR Gate | A | B | Pipe character should be escaped |",
"| Regex | foo|bar | Pipe in regex-like value |",
],
),
FileTestVector(
filename="test.json",
Expand Down
4 changes: 4 additions & 0 deletions packages/markitdown/tests/test_files/test_pipe_chars.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Name,Formula,Notes
OR Gate,A | B,Pipe character should be escaped
AND Gate,A & B,Regular value
Regex,foo|bar,Pipe in regex-like value