datalab-to · u-ashish · Nov 3, 2025 · Nov 14, 2025 · Nov 14, 2025 · Nov 14, 2025
diff --git a/datalab_sdk/models.py b/datalab_sdk/models.py
@@ -7,6 +7,9 @@
 from pathlib import Path
 import json
 import base64
+import webbrowser
+
+from datalab_sdk.utils import generate_spreadsheet_html_viewer
 
 
 @dataclass
@@ -15,6 +18,7 @@ class ProcessingOptions:
     max_pages: Optional[int] = None
     skip_cache: bool = False
     page_range: Optional[str] = None
+    extras: Optional[str] = None
 
     def to_form_data(self) -> Dict[str, Any]:
         """Convert to form data format for API requests"""
@@ -121,6 +125,170 @@ def save_output(
             ) as f:
                 json.dump(self.metadata, f, indent=2)
 
+    def _extract_blocks(self) -> List[Dict[str, Any]]:
+        """Return the dict blocks found under each page of ``self.json``."""
+        blocks: List[Dict[str, Any]] = []
+        pages = self.json.get("children") if isinstance(self.json, dict) else None
+        for page in pages or []:
+            children = page.get("children") if isinstance(page, dict) else None
+            # Null/missing children and non-dict entries are skipped, not raised on.
+            blocks.extend(b for b in (children or []) if isinstance(b, dict))
+        return blocks
+
+    def get_tables_by_page(self) -> Dict[str, List[Dict[str, Any]]]:
+        """
+        Extract tables grouped by sheet name.
+
+        The sheet name is read from the block ID (format:
+        /page/SheetName/Table/0). Table blocks whose ID is missing, null, or
+        lacks a "/page/" segment are grouped under "unknown".
+
+        Returns:
+            Dictionary mapping sheet names to lists of table blocks, each list
+            sorted by the table index found in the block ID (blocks without a
+            parsable index sort as 0).
+            Each table block contains: id, block_type, html, and other metadata.
+        """
+        sheets: Dict[str, List[Dict[str, Any]]] = {}
+
+        for block in self._extract_blocks():
+            # Guard against malformed (non-dict) entries in the API payload.
+            if not isinstance(block, dict) or block.get("block_type") != "Table":
+                continue
+
+            # ``or ""`` also covers an explicit ``"id": null``, which would
+            # otherwise raise TypeError on the ``in`` test below.
+            block_id = block.get("id") or ""
+
+            # "/page/SheetName/..." splits into ["", "page", "SheetName", ...], so
+            # any ID containing "/page/" has at least three parts.
+            sheet_name = "unknown"
+            if "/page/" in block_id:
+                sheet_name = block_id.split("/")[2]
+
+            sheets.setdefault(sheet_name, []).append(block)
+
+        # Sort tables within each sheet by their index in the block ID
+        for tables in sheets.values():
+            tables.sort(
+                key=lambda b: self._extract_table_index(b.get("id") or "")
+            )
+
+        return sheets
+
+    def _extract_table_index(self, block_id: str) -> int:
+        """Return the integer fifth ``/``-segment of a table block ID, else 0."""
+        # "/page/S/Table/3" splits into ["", "page", "S", "Table", "3"].
+        segments = block_id.split("/") if "/Table/" in block_id else []
+        if len(segments) < 5:
+            return 0
+        try:
+            return int(segments[4])
+        except ValueError:
+            return 0
+
+    def get_table_count(self) -> int:
+        """Return how many table blocks exist in total, summed over every sheet."""
+        per_sheet = self.get_tables_by_page().values()
+        return sum(map(len, per_sheet))
+
+    def generate_html_viewer(self, title: str = "XLSX Tables") -> str:
+        """
+        Render the extracted tables as a tabbed HTML page, one tab per sheet.
+
+        Args:
+            title: Heading passed through to the HTML viewer
+
+        Returns:
+            The HTML string produced by generate_spreadsheet_html_viewer
+        """
+        tables_by_sheet = self.get_tables_by_page()
+        return generate_spreadsheet_html_viewer(tables_by_sheet, title)
+
+    def save_html_viewer(
+        self, output_path: Union[str, Path], title: Optional[str] = None
+    ) -> Path:
+        """
+        Write the HTML viewer to disk as UTF-8 text.
+
+        Missing parent directories of the destination are created first.
+
+        Args:
+            output_path: Destination of the HTML file
+            title: Viewer title; when None, the destination's stem is used
+
+        Returns:
+            The destination as a Path
+        """
+        destination = Path(output_path)
+        destination.parent.mkdir(parents=True, exist_ok=True)
+
+        # Fall back to the file name (without extension) as the page title.
+        viewer_title = destination.stem if title is None else title
+        document = self.generate_html_viewer(title=viewer_title)
+
+        with destination.open("w", encoding="utf-8") as handle:
+            handle.write(document)
+        return destination
+
+    def open_in_browser(self, html_path: Optional[Union[str, Path]] = None) -> None:
+        """
+        Open HTML viewer in default browser.
+
+        Args:
+            html_path: Optional path to HTML file. If not provided, creates a temporary file.
+
+        Raises:
+            FileNotFoundError: If html_path does not exist.
+        """
+        if html_path is None:
+            import tempfile
+            import os
+            fd, temp_path = tempfile.mkstemp(suffix=".html", prefix="datalab_tables_")
+            os.close(fd)
+            html_path = self.save_html_viewer(temp_path)
+            # Note: We don't delete the temp file, as browser may need it
+
+        html_path = Path(html_path)
+        if not html_path.exists():
+            raise FileNotFoundError(f"HTML file not found: {html_path}")
+        # as_uri() yields a well-formed, percent-encoded file URI (Windows too).
+        webbrowser.open(html_path.absolute().as_uri())
+
+    def save_json(self, output_path: Union[str, Path]) -> Path:
+        """
+        Dump the raw JSON response, plus status fields, to a ``.json`` file.
+
+        Args:
+            output_path: Requested destination; its suffix is replaced with
+                ``.json`` when it is anything else
+
+        Returns:
+            Path of the file that was actually written
+        """
+        requested = Path(output_path)
+        requested.parent.mkdir(parents=True, exist_ok=True)
+
+        # Force a .json extension on the written file.
+        if requested.suffix == ".json":
+            destination = requested
+        else:
+            destination = requested.with_suffix(".json")
+
+        with destination.open("w", encoding="utf-8") as handle:
+            # Key order here is the key order in the written file.
+            payload = {
+                "success": self.success,
+                "output_format": self.output_format,
+                "json": self.json,
+                "error": self.error,
+                "page_count": self.page_count,
+                "status": self.status,
+            }
+            json.dump(payload, handle, indent=2, ensure_ascii=False)
+
+        return destination
+
 
 @dataclass
 class WorkflowStep:

diff --git a/datalab_sdk/utils.py b/datalab_sdk/utils.py
@@ -0,0 +1,190 @@
+"""
+Utility functions for the Datalab SDK
+"""
+
+from typing import Dict, List
+
+
+def generate_spreadsheet_html_viewer(
+    sheets: Dict[str, List[dict]], title: str = "XLSX Tables"
+) -> str:
+    """
+    Generate HTML viewer with tabs for each sheet.
+
+    Args:
+        sheets: Dictionary mapping sheet names to lists of table blocks.
+                Each table block should have an 'html' field containing the table
+                HTML, which is inserted verbatim; sheet names are HTML-escaped.
+        title: Title to display in the HTML viewer (HTML-escaped on output)
+
+    Returns:
+        HTML string with tabs per sheet (a placeholder page if sheets is empty)
+    """
+    from html import escape  # local import: no change to the module imports
+
+    if not sheets:
+        return "<html><body><p>No tables found.</p></body></html>"
+
+    # Titles (often a file stem) and sheet names are free text: escape them so
+    # characters such as "&" or "<" cannot break or inject markup.
+    safe_title = escape(str(title))
+    html_content = f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>{safe_title} - XLSX Tables</title>
+    <style>
+        body {{
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
+            margin: 0;
+            padding: 20px;
+            background-color: #f5f5f5;
+        }}
+        .container {{
+            max-width: 1400px;
+            margin: 0 auto;
+            background: white;
+            border-radius: 8px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+            overflow: hidden;
+        }}
+        .header {{
+            padding: 20px;
+            background: #2c3e50;
+            color: white;
+        }}
+        .header h1 {{
+            margin: 0;
+            font-size: 24px;
+        }}
+        .header p {{
+            margin: 8px 0 0 0;
+            opacity: 0.9;
+            font-size: 14px;
+        }}
+        .tabs {{
+            display: flex;
+            background: #ecf0f1;
+            border-bottom: 2px solid #bdc3c7;
+            overflow-x: auto;
+        }}
+        .tab {{
+            padding: 12px 24px;
+            cursor: pointer;
+            background: #ecf0f1;
+            border: none;
+            border-right: 1px solid #bdc3c7;
+            font-size: 14px;
+            font-weight: 500;
+            color: #34495e;
+            transition: background 0.2s;
+            white-space: nowrap;
+        }}
+        .tab:hover {{
+            background: #d5dbdb;
+        }}
+        .tab.active {{
+            background: white;
+            color: #2c3e50;
+            border-bottom: 2px solid #3498db;
+            margin-bottom: -2px;
+        }}
+        .tab-content {{
+            display: none;
+            padding: 20px;
+            overflow-x: auto;
+        }}
+        .tab-content.active {{
+            display: block;
+        }}
+        .table-container {{
+            margin-bottom: 30px;
+        }}
+        .table-header {{
+            font-size: 16px;
+            font-weight: 600;
+            color: #2c3e50;
+            margin-bottom: 12px;
+            padding-bottom: 8px;
+            border-bottom: 2px solid #ecf0f1;
+        }}
+        .table-wrapper {{
+            overflow-x: auto;
+            border: 1px solid #ddd;
+            border-radius: 4px;
+        }}
+        table {{
+            width: 100%;
+            border-collapse: collapse;
+            font-size: 14px;
+        }}
+        table th {{
+            background: #f8f9fa;
+            font-weight: 600;
+            padding: 10px;
+            text-align: left;
+            border-bottom: 2px solid #dee2e6;
+        }}
+        table td {{
+            padding: 8px 10px;
+            border-bottom: 1px solid #e9ecef;
+        }}
+        table tr:hover {{
+            background: #f8f9fa;
+        }}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <div class="header">
+            <h1>{safe_title}</h1>
+            <p>Extracted {sum(len(tables) for tables in sheets.values())} table(s) from {len(sheets)} sheet(s)</p>
+        </div>
+        <div class="tabs">
+"""
+
+    # Add tabs (the first sheet in sorted order starts out active)
+    sheet_names = sorted(sheets)
+    for idx, sheet_name in enumerate(sheet_names):
+        active_class = "active" if idx == 0 else ""
+        html_content += f'            <button class="tab {active_class}" onclick="showTab({idx})">{escape(str(sheet_name))}</button>\n'
+    html_content += "        </div>\n"
+
+    # Add tab content; tab-{idx} pairs with the button built from the same index
+    for idx, sheet_name in enumerate(sheet_names):
+        active_class = "active" if idx == 0 else ""
+        html_content += f'        <div class="tab-content {active_class}" id="tab-{idx}">\n'
+        tables = sheets[sheet_name]
+        for table_idx, block in enumerate(tables):
+            table_html = block.get("html", "")
+            if not table_html:
+                continue
+            html_content += '            <div class="table-container">\n'
+            if len(tables) > 1:
+                html_content += f'                <div class="table-header">Table {table_idx + 1}</div>\n'
+            html_content += '                <div class="table-wrapper">\n'
+            html_content += f'                    {table_html}\n'
+            html_content += '                </div>\n'
+            html_content += '            </div>\n'
+        html_content += "        </div>\n"
+
+    html_content += """    </div>
+    <script>
+        function showTab(index) {
+            // Hide all tabs and content
+            const tabs = document.querySelectorAll('.tab');
+            const contents = document.querySelectorAll('.tab-content');
+
+            tabs.forEach(tab => tab.classList.remove('active'));
+            contents.forEach(content => content.classList.remove('active'));
+
+            // Show selected tab and content
+            tabs[index].classList.add('active');
+            contents[index].classList.add('active');
+        }
+    </script>
+</body>
+</html>"""
+    return html_content
+