diff --git a/src/fairscape_cli/datasheet_builder/rocrate/datasheet_generator.py b/src/fairscape_cli/datasheet_builder/rocrate/datasheet_generator.py
index 522f562..5e453be 100644
--- a/src/fairscape_cli/datasheet_builder/rocrate/datasheet_generator.py
+++ b/src/fairscape_cli/datasheet_builder/rocrate/datasheet_generator.py
@@ -37,6 +37,7 @@
     SubcratesSectionGenerator,
     PreviewGenerator
 )
+from .summary_generator import SummarySectionGenerator
 
 
 def get_directory_size(directory):
@@ -80,6 +81,7 @@ def __init__(self, json_path: Path, template_dir: Path, published: bool = False)
         self.distribution_generator = DistributionSectionGenerator(self.env)
         self.subcrates_generator = SubcratesSectionGenerator(self.env)
         self.preview_generator = PreviewGenerator(self.env)
+        self.summary_generator = SummarySectionGenerator(self.env)
 
         with open(self.json_path, 'r') as f:
             crate_dict = json.load(f)
@@ -279,17 +281,20 @@ def save_datasheet(self, output_path: Optional[Path] = None) -> Path:
             output_path = Path(output_path)
 
         datasheet = self.convert_main_sections()
-        
+
+        summary_html = self.summary_generator.generate(self.main_crate, output_dir=self.base_dir)
+
         overview_html = self.overview_generator.generate(datasheet.overview, self.published)
         use_cases_html = self.use_cases_generator.generate(datasheet.use_cases)
         distribution_html = self.distribution_generator.generate(datasheet.distribution)
         subcrates_html = self.subcrates_generator.generate(datasheet.composition, self.published)
-        
+
         base_template = self.env.get_template('base.html')
-        
+
         context = {
             'title': datasheet.overview.title if datasheet.overview else "Untitled RO-Crate",
             'version': datasheet.overview.version if datasheet.overview else "",
+            'summary_section': summary_html,
             'overview_section': overview_html,
             'use_cases_section': use_cases_html,
             'distribution_section': distribution_html,
diff --git a/src/fairscape_cli/datasheet_builder/rocrate/section_generators.py b/src/fairscape_cli/datasheet_builder/rocrate/section_generators.py
index bb07bc0..fb37409 100644
--- a/src/fairscape_cli/datasheet_builder/rocrate/section_generators.py
+++ b/src/fairscape_cli/datasheet_builder/rocrate/section_generators.py
@@ -94,19 +94,37 @@ def generate(self, overview: Optional[OverviewSection], published: bool = False)
 
 class UseCasesSectionGenerator(SectionGenerator):
     """Convert UseCasesSection pydantic model to HTML."""
-    
+
    def generate(self, use_cases: Optional[UseCasesSection]) -> str:
        if not use_cases:
            return ""
-        
+
        context = {
            'intended_uses': use_cases.intended_use or "",
            'limitations': use_cases.limitations or "",
            'prohibited_uses': use_cases.prohibited_uses or "",
            'maintenance_plan': use_cases.maintenance_plan or "",
-            'potential_bias': use_cases.potential_sources_of_bias or ""
+            'potential_bias': use_cases.potential_sources_of_bias or "",
+
+            # Additional RAI fields
+            'data_collection': use_cases.data_collection or "",
+            'data_collection_type': use_cases.data_collection_type or "",
+            'data_collection_missing_data': use_cases.data_collection_missing_data or "",
+            'data_collection_raw_data': use_cases.data_collection_raw_data or "",
+            'data_collection_timeframe': use_cases.data_collection_timeframe or "",
+            'data_imputation_protocol': use_cases.data_imputation_protocol or "",
+            'data_manipulation_protocol': use_cases.data_manipulation_protocol or "",
+            'data_preprocessing_protocol': use_cases.data_preprocessing_protocol or "",
+            'data_annotation_protocol': use_cases.data_annotation_protocol or "",
+            'data_annotation_platform': use_cases.data_annotation_platform or "",
+            'data_annotation_analysis': use_cases.data_annotation_analysis or "",
+            'personal_sensitive_information': use_cases.personal_sensitive_information or "",
+            'data_social_impact': use_cases.data_social_impact or "",
+            'annotations_per_item': use_cases.annotations_per_item or "",
+            'annotator_demographics': use_cases.annotator_demographics or "",
+            'machine_annotation_tools': use_cases.machine_annotation_tools or "",
        }
-        
+
        return super().generate('sections/use_cases.html', **context)
 
 
diff --git a/src/fairscape_cli/datasheet_builder/rocrate/summary_generator.py b/src/fairscape_cli/datasheet_builder/rocrate/summary_generator.py
new file mode 100644
index 0000000..71ebaa0
--- /dev/null
+++ b/src/fairscape_cli/datasheet_builder/rocrate/summary_generator.py
@@ -0,0 +1,217 @@
+"""
+Summary section generator for datasheet.
+Generates the executive summary with AI-Readiness score.
+"""
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+from jinja2 import Environment
+
+from fairscape_models.rocrate import ROCrateV1_2
+from fairscape_models.conversion.mapping.AIReady import score_rocrate
+from fairscape_models.conversion.models.AIReady import AIReadyScore
+
+
+@dataclass
+class SummaryData:
+    """Data extracted from RO-Crate for the summary section."""
+    name: str
+    description: str
+    total_size_formatted: str = ""
+    total_entities: int = 0
+    dataset_count: int = 0
+    computation_count: int = 0
+    software_count: int = 0
+    formats: List[str] = field(default_factory=list)
+
+
+@dataclass
+class AIReadyCategory:
+    """A single AI-Ready score category."""
+    label: str
+    earned: int
+    possible: int
+    percentage: float
+    color: str
+
+
+@dataclass
+class AIReadyScoreData:
+    """AI-Ready score data for visualization."""
+    categories: List[AIReadyCategory]
+    total_earned: int
+    total_possible: int
+    total_percentage: float
+    total_color: str
+
+
+class SummarySectionGenerator:
+    """Generate the executive summary section with AI-Readiness score."""
+
+    CATEGORY_MAP = {
+        "fairness": ("Fairness", ["findable", "accessible", "interoperable", "reusable"]),
+        "provenance": ("Provenance", ["transparent", "traceable", "interpretable", "key_actors_identified"]),
+        "characterization": ("Characterization", ["semantics", "statistics", "standards", "potential_sources_of_bias", "data_quality"]),
+        "pre_model_explainability": ("Explainability", ["data_documentation_template", "fit_for_purpose", "verifiable"]),
+        "ethics": ("Ethics", ["ethically_acquired", "ethically_managed", "ethically_disseminated", "secure"]),
+        "sustainability": ("Sustainability", ["persistent", "domain_appropriate", "well_governed", "associated"]),
+        "computability": ("Computability", ["standardized", "computationally_accessible", "portable", "contextualized"]),
+    }
+
+    def __init__(self, template_engine: Environment):
+        self.template_engine = template_engine
+
+    @staticmethod
+    def _get_color(percentage: float) -> str:
+        """Return color based on percentage score."""
+        if percentage >= 75:
+            return "#4CAF50"
+        elif percentage >= 50:
+            return "#8BC34A"
+        elif percentage >= 25:
+            return "#FFC107"
+        return "#f44336"
+
+    def extract_summary_data(self, crate: ROCrateV1_2) -> SummaryData:
+        """Extract summary statistics from an RO-Crate."""
+        root_data = crate.metadataGraph[1].model_dump(by_alias=True) if len(crate.metadataGraph) > 1 else {}
+
+
+        size_str = root_data.get("contentSize", "")
+        if not size_str:
+            size_bytes = root_data.get("evi:totalContentSizeBytes", 0)
+            if size_bytes:
+                size_str = self._format_size(size_bytes)
+
+        formats = root_data.get("evi:formats", [])
+        if formats is None:
+            formats = []
+        formats = [f for f in formats if f and f != "unknown"]
+
+        return SummaryData(
+            name=root_data.get("name", "Unnamed Dataset"),
+            description=root_data.get("description", ""),
+            total_size_formatted=size_str,
+            total_entities=root_data.get("evi:totalEntities", 0),
+            dataset_count=root_data.get("evi:datasetCount", 0),
+            computation_count=root_data.get("evi:computationCount", 0),
+            software_count=root_data.get("evi:softwareCount", 0),
+            formats=formats
+        )
+
+    @staticmethod
+    def _format_size(size_bytes: int) -> str:
+        """Format bytes to human-readable size."""
+        if size_bytes >= 1e12:
+            return f"{size_bytes / 1e12:.1f} TB"
+        elif size_bytes >= 1e9:
+            return f"{size_bytes / 1e9:.1f} GB"
+        elif size_bytes >= 1e6:
+            return f"{size_bytes / 1e6:.1f} MB"
+        elif size_bytes >= 1e3:
+            return f"{size_bytes / 1e3:.1f} KB"
+        return f"{size_bytes} B"
+
+    def compute_aiready_score(self, crate: ROCrateV1_2) -> Tuple[AIReadyScoreData, AIReadyScore]:
+        """Compute AI-Ready score from an RO-Crate.
+
+        Returns:
+            Tuple of (AIReadyScoreData for visualization, AIReadyScore raw pydantic model)
+        """
+        crate_dict = {
+            "@context": crate.context,
+            "@graph": [entity.model_dump(by_alias=True) for entity in crate.metadataGraph]
+        }
+        raw_score = score_rocrate(crate_dict)
+
+        categories = []
+        total_earned = 0
+        total_possible = 0
+
+        for cat_key, (label, subcriteria) in self.CATEGORY_MAP.items():
+            cat_score = getattr(raw_score, cat_key)
+            earned = sum(1 for sc in subcriteria if getattr(cat_score, sc).has_content)
+            possible = len(subcriteria)
+            percentage = (earned / possible * 100) if possible > 0 else 0
+
+            categories.append(AIReadyCategory(
+                label=label,
+                earned=earned,
+                possible=possible,
+                percentage=round(percentage, 1),
+                color=self._get_color(percentage)
+            ))
+
+            total_earned += earned
+            total_possible += possible
+
+        total_percentage = (total_earned / total_possible * 100) if total_possible > 0 else 0
+
+        score_data = AIReadyScoreData(
+            categories=categories,
+            total_earned=total_earned,
+            total_possible=total_possible,
+            total_percentage=round(total_percentage, 1),
+            total_color=self._get_color(total_percentage)
+        )
+
+        return score_data, raw_score
+
+    def save_aiready_score(self, raw_score: AIReadyScore, output_path: Path) -> None:
+        """Save the AI-Ready score to a JSON file."""
+        score_dict = raw_score.model_dump()
+        with open(output_path, 'w') as f:
+            json.dump(score_dict, f, indent=2)
+
+    def generate(self, crate: ROCrateV1_2, output_dir: Optional[Path] = None) -> str:
+        """Generate the summary section HTML.
+
+        Args:
+            crate: The RO-Crate to generate summary for
+            output_dir: Directory to save ai_ready_score.json (optional)
+
+        Returns:
+            HTML string for the summary section
+        """
+        summary = self.extract_summary_data(crate)
+        score_data, raw_score = self.compute_aiready_score(crate)
+
+        aiready_json_path = None
+        if output_dir:
+            aiready_json_path = output_dir / "ai_ready_score.json"
+            self.save_aiready_score(raw_score, aiready_json_path)
+
+        desc = summary.description
+        if len(desc) > 500:
+            desc = desc[:500].rsplit(" ", 1)[0] + "..."
+
+        formats_str = ", ".join(sorted(summary.formats)[:10])
+        if len(summary.formats) > 10:
+            formats_str += f" (+{len(summary.formats) - 10} more)"
+
+        context = {
+            'description': desc,
+            'total_size': summary.total_size_formatted,
+            'total_entities': f"{summary.total_entities:,}" if summary.total_entities else "N/A",
+            'formats': formats_str,
+            'dataset_count': f"{summary.dataset_count:,}" if summary.dataset_count else "0",
+            'computation_count': f"{summary.computation_count:,}" if summary.computation_count else "0",
+            'software_count': f"{summary.software_count:,}" if summary.software_count else "0",
+            'aiready_categories': [
+                {
+                    'label': cat.label,
+                    'earned': cat.earned,
+                    'possible': cat.possible,
+                    'percentage': cat.percentage,
+                    'color': cat.color
+                }
+                for cat in score_data.categories
+            ],
+            'aiready_total_percentage': score_data.total_percentage,
+            'aiready_total_color': score_data.total_color,
+            'aiready_json_filename': "ai_ready_score.json" if output_dir else None
+        }
+
+        template = self.template_engine.get_template('sections/summary.html')
+        return template.render(**context)
diff --git a/src/fairscape_cli/datasheet_builder/templates/base.html b/src/fairscape_cli/datasheet_builder/templates/base.html
index 88f6360..2624323 100644
--- a/src/fairscape_cli/datasheet_builder/templates/base.html
+++ b/src/fairscape_cli/datasheet_builder/templates/base.html
@@ -452,6 +452,138 @@
    }
    /* END: New styles for sidebar navigation */
 
+    /* START: Executive Summary and AI-Readiness Score styles */
+    .executive-summary-section {
+      background-color: #e9ecef;
+      padding: 20px;
+      border-radius: 5px;
+      margin-bottom: 20px;
+      border: 1px solid #ced4da;
+    }
+
+    .executive-summary-section h2 {
+      font-size: 18px;
+      color: #2c3e50;
+      margin: 0 0 12px 0;
+      padding-bottom: 8px;
+      border-bottom: 2px solid #2c3e50;
+    }
+
+    .summary-description {
+      font-size: 13px;
+      color: #495057;
+      margin-bottom: 15px;
+    }
+
+    .exec-summary-grid {
+      display: grid;
+      grid-template-columns: 1fr 1fr;
+      gap: 15px;
+    }
+
+    .exec-summary-item {
+      background-color: #f8f9fa;
+      padding: 12px 15px;
+      border-radius: 4px;
+      border: 1px solid #e9ecef;
+    }
+
+    .exec-summary-label {
+      font-weight: bold;
+      color: #2c3e50;
+      display: block;
+      margin-bottom: 10px;
+      padding-bottom: 6px;
+      border-bottom: 1px solid #dee2e6;
+      font-size: 13px;
+    }
+
+    .exec-summary-item .stat-item {
+      display: block;
+      margin-bottom: 6px;
+      font-size: 12px;
+    }
+
+    .stat-sub-label {
+      color: #495057;
+    }
+
+    .stat-sub-value {
+      color: #333;
+    }
+
+    .formats-list {
+      word-break: break-word;
+    }
+
+    /* AI-Ready Score Bars */
+    .aiready-bars {
+      margin-top: 2px;
+    }
+
+    .aiready-bar-row {
+      display: flex;
+      align-items: center;
+      margin-bottom: 4px;
+    }
+
+    .aiready-bar-label {
+      width: 90px;
+      font-size: 10px;
+      font-weight: 600;
+      color: #444;
+    }
+
+    .aiready-bar-track {
+      flex: 1;
+      height: 12px;
+      background: #e0e0e0;
+      border-radius: 2px;
+      overflow: hidden;
+    }
+
+    .aiready-bar-fill {
+      height: 100%;
+      border-radius: 2px;
+    }
+
+    .aiready-bar-score {
+      width: 30px;
+      font-size: 9px;
+      text-align: right;
+      color: #666;
+      margin-left: 6px;
+    }
+
+    .aiready-overall-row {
+      margin-top: 6px;
+      padding-top: 6px;
+      border-top: 1px solid #ddd;
+    }
+
+    .aiready-overall-row .aiready-bar-label {
+      font-weight: bold;
+      color: #2c3e50;
+    }
+
+    .aiready-overall-row .aiready-bar-score {
+      font-weight: bold;
+      color: #333;
+    }
+
+    .view-details-link {
+      font-size: 11px;
+      font-weight: normal;
+      color: #2c3e50;
+      text-decoration: none;
+      margin-left: 5px;
+    }
+
+    .view-details-link:hover {
+      text-decoration: underline;
+    }
+    /* END: Executive Summary and AI-Readiness Score styles */
+
    @media print {
      body {
        background-color: white;
@@ -481,7 +613,8 @@
 
    @media (max-width: 768px) {
      .compact-grid,
-      .regulatory-grid {
+      .regulatory-grid,
+      .exec-summary-grid {
        grid-template-columns: 1fr;
      }
 
@@ -503,6 +636,7 @@
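Reviewer note (not part of the patch): a minimal sketch of how the new `SummarySectionGenerator` could be exercised on its own, outside `DatasheetGenerator`. It assumes the patched package is importable, that the template directory contains `sections/summary.html`, and that the crate metadata validates against `ROCrateV1_2`; the `template_dir`, `crate_path`, and `output_dir` paths are hypothetical placeholders.

```python
# Hypothetical smoke test for SummarySectionGenerator; paths are placeholders.
import json
from pathlib import Path

from jinja2 import Environment, FileSystemLoader
from fairscape_models.rocrate import ROCrateV1_2

from fairscape_cli.datasheet_builder.rocrate.summary_generator import SummarySectionGenerator

template_dir = Path("src/fairscape_cli/datasheet_builder/templates")  # assumed checkout layout
crate_path = Path("my_crate/ro-crate-metadata.json")                  # assumed crate location
output_dir = Path("my_crate")                                         # ai_ready_score.json is written here

env = Environment(loader=FileSystemLoader(str(template_dir)))

with open(crate_path) as f:
    # ROCrateV1_2 is a pydantic v2 model in the diff, so model_validate should apply.
    crate = ROCrateV1_2.model_validate(json.load(f))

generator = SummarySectionGenerator(env)
summary_html = generator.generate(crate, output_dir=output_dir)

print(summary_html[:200])  # quick sanity check of the rendered section
```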