Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 168 additions & 0 deletions datalab_sdk/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
from pathlib import Path
import json
import base64
import webbrowser

from datalab_sdk.utils import generate_spreadsheet_html_viewer


@dataclass
Expand All @@ -15,6 +18,7 @@ class ProcessingOptions:
max_pages: Optional[int] = None
skip_cache: bool = False
page_range: Optional[str] = None
extras: Optional[str] = None

def to_form_data(self) -> Dict[str, Any]:
"""Convert to form data format for API requests"""
Expand Down Expand Up @@ -121,6 +125,170 @@ def save_output(
) as f:
json.dump(self.metadata, f, indent=2)

def _extract_blocks(self) -> List[Dict[str, Any]]:
    """Collect every block from the two-level ``children`` tree in ``self.json``.

    The payload is walked as ``{"children": [page, ...]}`` where each page is
    a dict carrying its own ``"children"`` list of blocks. Anything that does
    not fit that shape (non-dict payload, ``null`` or non-list ``children``,
    non-dict pages or blocks) is skipped instead of raising, so callers can
    rely on every returned item being a dict.

    Returns:
        Flat list of block dicts, page by page, in their original order.
    """
    document = self.json
    if not isinstance(document, dict):
        return []

    pages = document.get("children")
    if not isinstance(pages, list):
        # Covers both a missing key and an explicit JSON ``null``.
        return []

    blocks: List[Dict[str, Any]] = []
    for page in pages:
        if not isinstance(page, dict):
            continue
        children = page.get("children")
        if not isinstance(children, list):
            continue
        # Downstream code calls ``block.get(...)``, so drop non-dict entries.
        blocks.extend(child for child in children if isinstance(child, dict))
    return blocks

def get_tables_by_page(self) -> Dict[str, List[Dict[str, Any]]]:
    """
    Extract tables grouped by sheet name.

    Only blocks whose ``block_type`` is ``"Table"`` are kept. The sheet name
    is read from the block ID, which is expected to look like
    ``/page/<SheetName>/Table/<index>``: it is the third ``/``-separated
    segment. Blocks whose ID is missing, is not a string, or does not contain
    ``/page/`` are grouped under ``"unknown"``.

    Returns:
        Dictionary mapping sheet names to lists of table blocks. Within each
        sheet the blocks are ordered by the table index taken from the block
        ID; blocks with equal indices keep their original relative order.
    """

    def block_id_of(block: Dict[str, Any]) -> str:
        # A JSON ``null`` (or any non-string) id is treated as "no id" so the
        # substring checks below and in _extract_table_index cannot raise.
        raw_id = block.get("id", "")
        return raw_id if isinstance(raw_id, str) else ""

    sheets: Dict[str, List[Dict[str, Any]]] = {}

    for block in self._extract_blocks():
        if not isinstance(block, dict) or block.get("block_type", "") != "Table":
            continue

        block_id = block_id_of(block)
        sheet_name = "unknown"
        if "/page/" in block_id:
            # Containing "/page/" implies at least two slashes, so splitting
            # always yields three or more segments and index 2 is safe.
            sheet_name = block_id.split("/")[2]

        sheets.setdefault(sheet_name, []).append(block)

    # Sort tables within each sheet by their index in the block ID.
    for tables in sheets.values():
        tables.sort(key=lambda b: self._extract_table_index(block_id_of(b)))

    return sheets

def _extract_table_index(self, block_id: str) -> int:
    """Return the table index encoded in a block ID, or 0 if there is none.

    The index is the fifth ``/``-separated segment of an ID that contains
    ``/Table/`` (e.g. ``/page/Sheet1/Table/3`` gives ``3``). IDs without
    ``/Table/``, with fewer than five segments, or whose fifth segment is
    not an integer all yield 0.
    """
    if "/Table/" not in block_id:
        return 0

    segments = block_id.split("/")
    if len(segments) < 5:
        return 0

    try:
        index = int(segments[4])
    except ValueError:
        index = 0
    return index

def get_table_count(self) -> int:
    """Return the number of table blocks summed over every sheet."""
    total = 0
    for sheet_tables in self.get_tables_by_page().values():
        total += len(sheet_tables)
    return total

def generate_html_viewer(self, title: str = "XLSX Tables") -> str:
    """
    Render the extracted tables as a tabbed HTML page.

    The tables are grouped with ``get_tables_by_page`` and the grouping is
    handed to ``generate_spreadsheet_html_viewer`` together with ``title``.

    Args:
        title: Title to display in the HTML viewer

    Returns:
        HTML string with tabs per sheet
    """
    return generate_spreadsheet_html_viewer(self.get_tables_by_page(), title)

def save_html_viewer(
    self, output_path: Union[str, Path], title: Optional[str] = None
) -> Path:
    """
    Write the HTML viewer to disk.

    Missing parent directories are created. The file is written as UTF-8.

    Args:
        output_path: Destination of the HTML file
        title: Viewer title; when ``None`` the file name without its suffix
            is used

    Returns:
        The destination as a ``Path``
    """
    target = Path(output_path)
    # Directories are created before rendering, matching the write order
    # callers may already depend on.
    target.parent.mkdir(parents=True, exist_ok=True)

    viewer_title = target.stem if title is None else title
    markup = self.generate_html_viewer(title=viewer_title)

    with target.open("w", encoding="utf-8") as handle:
        handle.write(markup)

    return target

def open_in_browser(self, html_path: Optional[Union[str, Path]] = None) -> None:
    """
    Open the HTML viewer in the default browser.

    Args:
        html_path: Optional path to an existing HTML file. If not provided,
            the viewer is rendered into a newly created temporary file
            (``datalab_tables_*.html``) which is then opened.

    Raises:
        FileNotFoundError: If the file to open does not exist.
    """
    if html_path is None:
        import os
        import tempfile

        fd, temp_path = tempfile.mkstemp(suffix=".html", prefix="datalab_tables_")
        # Only the reserved name is needed; save_html_viewer reopens the file.
        os.close(fd)
        html_path = Path(temp_path)
        self.save_html_viewer(html_path)
        # Note: the temp file is deliberately not deleted, as the browser may
        # still need to read it after this method returns.

    html_path = Path(html_path)
    if not html_path.exists():
        raise FileNotFoundError(f"HTML file not found: {html_path}")

    # Path.as_uri() builds a well-formed file URI: it percent-encodes spaces
    # and other reserved characters and handles Windows drive letters, which
    # plain "file://" + path concatenation does not.
    webbrowser.open(html_path.absolute().as_uri())

def save_json(self, output_path: Union[str, Path]) -> Path:
    """
    Dump the raw JSON response and its status fields to a file.

    The written object has the keys ``success``, ``output_format``,
    ``json``, ``error``, ``page_count`` and ``status``, read from the
    attributes of the same names. Missing parent directories are created,
    and a suffix other than ``.json`` is replaced by ``.json``.

    Args:
        output_path: Requested destination of the JSON file

    Returns:
        Path actually written (always ending in ``.json``)
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Enforce the .json extension; with_suffix swaps out any existing suffix.
    if destination.suffix != ".json":
        destination = destination.with_suffix(".json")

    with destination.open("w", encoding="utf-8") as handle:
        payload = {
            "success": self.success,
            "output_format": self.output_format,
            "json": self.json,
            "error": self.error,
            "page_count": self.page_count,
            "status": self.status,
        }
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        json.dump(payload, handle, indent=2, ensure_ascii=False)

    return destination


@dataclass
class WorkflowStep:
Expand Down
190 changes: 190 additions & 0 deletions datalab_sdk/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
"""
Utility functions for the Datalab SDK
"""

from typing import Dict, List


def generate_spreadsheet_html_viewer(
    sheets: Dict[str, List[dict]], title: str = "XLSX Tables"
) -> str:
    """
    Generate a standalone HTML viewer with one tab per sheet.

    Sheets are shown in sorted name order and the first one starts active.
    ``title`` and the sheet names are HTML-escaped before being inserted,
    because they originate from file names and document content. Each
    block's ``html`` value is inserted verbatim, as it is expected to already
    be table markup.

    Args:
        sheets: Dictionary mapping sheet names to lists of table blocks.
            Each table block should have an 'html' field containing the
            table HTML; blocks with an empty or missing 'html' are skipped.
        title: Title to display in the HTML viewer

    Returns:
        HTML string with tabs per sheet, or a minimal "No tables found."
        page when ``sheets`` is empty.
    """
    # Imported locally so this function does not rely on a module-level
    # ``html`` name.
    from html import escape

    if not sheets:
        return "<html><body><p>No tables found.</p></body></html>"

    # str() keeps non-string titles/names (e.g. None) rendering as before.
    safe_title = escape(str(title))
    table_total = sum(len(tables) for tables in sheets.values())
    sheet_names = sorted(sheets.keys())

    # Static stylesheet kept out of the f-strings so CSS braces need no doubling.
    stylesheet = """    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .container {
            max-width: 1400px;
            margin: 0 auto;
            background: white;
            border-radius: 8px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
            overflow: hidden;
        }
        .header {
            padding: 20px;
            background: #2c3e50;
            color: white;
        }
        .header h1 {
            margin: 0;
            font-size: 24px;
        }
        .header p {
            margin: 8px 0 0 0;
            opacity: 0.9;
            font-size: 14px;
        }
        .tabs {
            display: flex;
            background: #ecf0f1;
            border-bottom: 2px solid #bdc3c7;
            overflow-x: auto;
        }
        .tab {
            padding: 12px 24px;
            cursor: pointer;
            background: #ecf0f1;
            border: none;
            border-right: 1px solid #bdc3c7;
            font-size: 14px;
            font-weight: 500;
            color: #34495e;
            transition: background 0.2s;
            white-space: nowrap;
        }
        .tab:hover {
            background: #d5dbdb;
        }
        .tab.active {
            background: white;
            color: #2c3e50;
            border-bottom: 2px solid #3498db;
            margin-bottom: -2px;
        }
        .tab-content {
            display: none;
            padding: 20px;
            overflow-x: auto;
        }
        .tab-content.active {
            display: block;
        }
        .table-container {
            margin-bottom: 30px;
        }
        .table-header {
            font-size: 16px;
            font-weight: 600;
            color: #2c3e50;
            margin-bottom: 12px;
            padding-bottom: 8px;
            border-bottom: 2px solid #ecf0f1;
        }
        .table-wrapper {
            overflow-x: auto;
            border: 1px solid #ddd;
            border-radius: 4px;
        }
        table {
            width: 100%;
            border-collapse: collapse;
            font-size: 14px;
        }
        table th {
            background: #f8f9fa;
            font-weight: 600;
            padding: 10px;
            text-align: left;
            border-bottom: 2px solid #dee2e6;
        }
        table td {
            padding: 8px 10px;
            border-bottom: 1px solid #e9ecef;
        }
        table tr:hover {
            background: #f8f9fa;
        }
    </style>
"""

    # Fragments are collected and joined once instead of repeated ``+=``.
    parts = [
        f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{safe_title} - XLSX Tables</title>
""",
        stylesheet,
        f"""</head>
<body>
    <div class="container">
        <div class="header">
            <h1>{safe_title}</h1>
            <p>Extracted {table_total} table(s) from {len(sheets)} sheet(s)</p>
        </div>
        <div class="tabs">
""",
    ]

    # Add tabs: button N toggles the tab-content element at the same position.
    for idx, sheet_name in enumerate(sheet_names):
        active_class = "active" if idx == 0 else ""
        parts.append(
            f'            <button class="tab {active_class}" '
            f'onclick="showTab({idx})">{escape(str(sheet_name))}</button>\n'
        )

    parts.append("        </div>\n")

    # Add tab content, in the same order as the tab buttons.
    for idx, sheet_name in enumerate(sheet_names):
        active_class = "active" if idx == 0 else ""
        parts.append(
            f'        <div class="tab-content {active_class}" id="tab-{idx}">\n'
        )

        tables = sheets[sheet_name]
        for table_idx, block in enumerate(tables):
            table_html = block.get("html", "")
            if not table_html:
                continue

            parts.append('            <div class="table-container">\n')
            # A numbered heading is only useful when the sheet has several tables.
            if len(tables) > 1:
                parts.append(
                    f'                <div class="table-header">Table {table_idx + 1}</div>\n'
                )
            parts.append('                <div class="table-wrapper">\n')
            # NOTE: inserted unescaped on purpose; this is the table markup itself.
            parts.append(f"                    {table_html}\n")
            parts.append("                </div>\n")
            parts.append("            </div>\n")

        parts.append("        </div>\n")

    parts.append(
        """    </div>
    <script>
        function showTab(index) {
            // Hide all tabs and content
            const tabs = document.querySelectorAll('.tab');
            const contents = document.querySelectorAll('.tab-content');

            tabs.forEach(tab => tab.classList.remove('active'));
            contents.forEach(content => content.classList.remove('active'));

            // Show selected tab and content
            tabs[index].classList.add('active');
            contents[index].classList.add('active');
        }
    </script>
</body>
</html>"""
    )

    return "".join(parts)

Loading