diff --git a/src/fairscape_cli/commands/track.py b/src/fairscape_cli/commands/track.py index ddafd20..8337f54 100644 --- a/src/fairscape_cli/commands/track.py +++ b/src/fairscape_cli/commands/track.py @@ -1,8 +1,9 @@ import click import pathlib import os +import sys import runpy -from typing import List +from typing import List, Tuple from fairscape_cli.tracking.io_capture import IOCapture from fairscape_cli.tracking.provenance_tracker import ProvenanceTracker @@ -10,14 +11,21 @@ from fairscape_cli.tracking.metadata_generator import create_metadata_generator -@click.command('track') +@click.command('track', context_settings=dict( + ignore_unknown_options=True, + allow_extra_args=True, +)) @click.argument('script-path', type=click.Path(exists=True, path_type=pathlib.Path)) @click.option('--rocrate-path', type=click.Path(path_type=pathlib.Path), default=None, help='Path to RO-Crate directory (default: current directory)') @click.option('--author', type=str, default="Unknown", help='Author name (default: from RO-Crate or "Unknown")') @click.option('--keywords', multiple=True, default=["computation"], help='Keywords for metadata (default: from RO-Crate or ["computation"])') @click.option('--input', 'manual_inputs', multiple=True, help='Manual input files to track') @click.option('--no-llm', is_flag=True, default=False, help='Disable LLM-based description generation') +@click.option('--max-images', type=int, default=5, help='Maximum number of images to send to LLM for analysis (default: 5)') @click.option('--execution-name', type=str, default=None, help='Name for this execution (default: script filename)') +@click.option('--reference-crate', 'reference_crates', multiple=True, type=click.Path(exists=True, path_type=pathlib.Path), help='Reference RO-Crate(s) to look up existing ARKs for input files') +@click.option('--start-clean', is_flag=True, default=False, help='Clear existing @graph entries (except root) before tracking') +@click.argument('script_args', nargs=-1, 
type=click.UNPROCESSED) @click.pass_context def track( ctx, @@ -27,27 +35,37 @@ def track( keywords: List[str], manual_inputs: List[str], no_llm: bool, - execution_name: str + max_images: int, + execution_name: str, + reference_crates: Tuple[pathlib.Path, ...], + start_clean: bool, + script_args: Tuple[str, ...] ): """Track execution of a Python script and generate provenance metadata. - + Executes SCRIPT_PATH while capturing file I/O operations, then generates RO-Crate metadata documenting the computation, software, input datasets, and output datasets. - + + Script arguments can be passed after the script path and options. + Examples: - + fairscape-cli track analysis.py - + fairscape-cli track analysis.py --author "Jane Doe" --keywords ml analysis - + fairscape-cli track analysis.py --rocrate-path ./my-crate --input config.json - + fairscape-cli track analysis.py --no-llm --author "John Smith" + + fairscape-cli track script.py -- ./output_dir --inputdir ./input_dir + + fairscape-cli track script.py --reference-crate ./input_data_crate -- ./output --inputdir ./input_data_crate/data """ - rocrate_path = rocrate_path or pathlib.Path.cwd() - + rocrate_path = (rocrate_path or pathlib.Path.cwd()).resolve() + if not script_path.exists(): click.echo(f"ERROR: Script file not found: {script_path}", err=True) ctx.exit(code=1) @@ -60,16 +78,26 @@ def track( ctx.exit(code=1) tracker_config = TrackerConfig() - + original_cwd = pathlib.Path.cwd() - script_dir = script_path.parent.resolve() - + + script_path_resolved = script_path.resolve() + + resolved_args = [] + for arg in script_args: + arg_path = pathlib.Path(arg) + if arg_path.exists() or (not arg.startswith('-') and ('/' in arg or '\\' in arg)): + resolved_args.append(str((original_cwd / arg).resolve())) + else: + resolved_args.append(arg) + + original_argv = sys.argv + sys.argv = [str(script_path_resolved)] + resolved_args + try: - os.chdir(script_dir) - with IOCapture(config=tracker_config) as capture: try: - 
runpy.run_path(str(script_path), run_name='__main__') + runpy.run_path(str(script_path_resolved), run_name='__main__') except SystemExit as e: if e.code != 0: click.echo(f"WARNING: Script exited with code {e.code}", err=True) @@ -77,23 +105,47 @@ def track( click.echo(f"ERROR: Script execution failed: {exc}", err=True) ctx.exit(code=1) finally: - os.chdir(original_cwd) - + sys.argv = original_argv + if not capture.inputs and not capture.outputs and not manual_inputs: click.echo("WARNING: No file I/O detected in script execution", err=True) click.echo("No metadata generated.", err=True) return - + use_llm = not no_llm and os.environ.get("GEMINI_API_KEY") - + + # Log LLM decision + if no_llm: + click.echo("INFO: LLM disabled via --no-llm flag", err=True) + elif not os.environ.get("GEMINI_API_KEY"): + click.echo("INFO: LLM disabled - GEMINI_API_KEY environment variable not set", err=True) + click.echo("INFO: Set GEMINI_API_KEY to enable AI-generated descriptions", err=True) + else: + click.echo("INFO: LLM enabled - will generate AI descriptions", err=True) + metadata_generator = None if use_llm: from datetime import datetime try: + # Test if google.generativeai is available + try: + import google.generativeai as genai + click.echo("INFO: google-generativeai package found", err=True) + except ImportError as imp_exc: + click.echo(f"WARNING: google-generativeai package not installed: {imp_exc}", err=True) + click.echo("INFO: Install with: pip install google-generativeai", err=True) + click.echo("Falling back to simple descriptions", err=True) + raise + metadata_generator = create_metadata_generator( provider="gemini", - timestamp=datetime.now().strftime("%Y%m%d_%H%M%S") + timestamp=datetime.now().strftime("%Y%m%d_%H%M%S"), + max_images=max_images ) + click.echo("INFO: LLM metadata generator initialized successfully", err=True) + except ImportError: + # Already logged above + pass except Exception as exc: click.echo(f"WARNING: Could not initialize LLM metadata generator: 
{exc}", err=True) click.echo("Falling back to simple descriptions", err=True) @@ -103,7 +155,9 @@ def track( author=author, keywords=list(keywords), manual_inputs=list(manual_inputs), - use_llm=use_llm + use_llm=use_llm, + reference_crates=list(reference_crates), + start_clean=start_clean ) try: diff --git a/src/fairscape_cli/models/utils.py b/src/fairscape_cli/models/utils.py index dd5c3a1..4368ca4 100644 --- a/src/fairscape_cli/models/utils.py +++ b/src/fairscape_cli/models/utils.py @@ -22,11 +22,17 @@ def setRelativeFilepath(cratePath, filePath): return filePath if 'ro-crate-metadata.json' in str(cratePath): - rocratePath = pathlib.Path(cratePath).parent.absolute() + rocratePath = pathlib.Path(cratePath).parent.resolve() else: - rocratePath = pathlib.Path(cratePath).absolute() - - datasetPath = pathlib.Path(filePath).absolute() + rocratePath = pathlib.Path(cratePath).resolve() + + # If filepath is relative, resolve it relative to the crate path, not cwd + filePathObj = pathlib.Path(filePath) + if not filePathObj.is_absolute(): + datasetPath = (rocratePath / filePath).resolve() + else: + datasetPath = filePathObj.resolve() + relativePath = datasetPath.relative_to(rocratePath) return f"file:///{str(relativePath)}" diff --git a/src/fairscape_cli/tracking/config.py b/src/fairscape_cli/tracking/config.py index 0feb2d4..aebc505 100644 --- a/src/fairscape_cli/tracking/config.py +++ b/src/fairscape_cli/tracking/config.py @@ -9,6 +9,7 @@ class TrackerConfig: track_pathlib: bool = True track_pandas: bool = True track_numpy: bool = True + track_matplotlib: bool = True excluded_patterns: List[str] = field(default_factory=lambda: [ '.matplotlib', '.ipython', @@ -44,3 +45,5 @@ class ProvenanceConfig: keywords: List[str] = field(default_factory=lambda: ["jupyter", "computation"]) manual_inputs: List[str] = field(default_factory=list) use_llm: bool = False + reference_crates: List[Path] = field(default_factory=list) + start_clean: bool = False diff --git 
a/src/fairscape_cli/tracking/io_capture.py b/src/fairscape_cli/tracking/io_capture.py index f02c440..d1a0f98 100644 --- a/src/fairscape_cli/tracking/io_capture.py +++ b/src/fairscape_cli/tracking/io_capture.py @@ -179,55 +179,87 @@ def patch_numpy(self): """Patch numpy methods to track file I/O.""" if not self.config.track_numpy: return - + original_load = np.load original_save = np.save original_loadtxt = np.loadtxt original_savetxt = np.savetxt - + self.original_functions['np.load'] = original_load self.original_functions['np.save'] = original_save self.original_functions['np.loadtxt'] = original_loadtxt self.original_functions['np.savetxt'] = original_savetxt - + capture = self - + def tracked_load(file, *args, **kwargs): if capture._should_track(file): capture.inputs.add(capture._normalize_path(file)) return original_load(file, *args, **kwargs) - + def tracked_save(file, arr, *args, **kwargs): if capture._should_track(file): capture.outputs.add(capture._normalize_path(file)) return original_save(file, arr, *args, **kwargs) - + def tracked_loadtxt(fname, *args, **kwargs): if capture._should_track(fname): capture.inputs.add(capture._normalize_path(fname)) return original_loadtxt(fname, *args, **kwargs) - + def tracked_savetxt(fname, X, *args, **kwargs): if capture._should_track(fname): capture.outputs.add(capture._normalize_path(fname)) return original_savetxt(fname, X, *args, **kwargs) - + np.load = tracked_load np.save = tracked_save np.loadtxt = tracked_loadtxt np.savetxt = tracked_savetxt + + def patch_matplotlib(self): + """Patch matplotlib methods to track file I/O.""" + if not self.config.track_matplotlib: + return + + try: + import matplotlib.pyplot as plt + from matplotlib.figure import Figure + except ImportError: + return + + original_plt_savefig = plt.savefig + original_figure_savefig = Figure.savefig + + self.original_functions['plt.savefig'] = original_plt_savefig + self.original_functions['Figure.savefig'] = original_figure_savefig + + capture = 
self + + def tracked_plt_savefig(fname, *args, **kwargs): + if capture._should_track(fname): + capture.outputs.add(capture._normalize_path(fname)) + return original_plt_savefig(fname, *args, **kwargs) + + def tracked_figure_savefig(self, fname, *args, **kwargs): + if capture._should_track(fname): + capture.outputs.add(capture._normalize_path(fname)) + return original_figure_savefig(self, fname, *args, **kwargs) + + plt.savefig = tracked_plt_savefig + Figure.savefig = tracked_figure_savefig def restore_all(self): """Restore all original functions.""" builtins.open = self.original_functions.get('builtins.open', builtins.open) - + if 'pathlib.Path.open' in self.original_functions: pathlib.Path.open = self.original_functions['pathlib.Path.open'] pathlib.Path.read_text = self.original_functions['pathlib.Path.read_text'] pathlib.Path.read_bytes = self.original_functions['pathlib.Path.read_bytes'] pathlib.Path.write_text = self.original_functions['pathlib.Path.write_text'] pathlib.Path.write_bytes = self.original_functions['pathlib.Path.write_bytes'] - + if 'pd.read_csv' in self.original_functions: pd.read_csv = self.original_functions['pd.read_csv'] pd.read_excel = self.original_functions['pd.read_excel'] @@ -237,18 +269,28 @@ def restore_all(self): pd.DataFrame.to_excel = self.original_functions['pd.DataFrame.to_excel'] pd.DataFrame.to_parquet = self.original_functions['pd.DataFrame.to_parquet'] pd.DataFrame.to_json = self.original_functions['pd.DataFrame.to_json'] - + if 'np.load' in self.original_functions: np.load = self.original_functions['np.load'] np.save = self.original_functions['np.save'] np.loadtxt = self.original_functions['np.loadtxt'] np.savetxt = self.original_functions['np.savetxt'] - + + if 'plt.savefig' in self.original_functions: + try: + import matplotlib.pyplot as plt + from matplotlib.figure import Figure + plt.savefig = self.original_functions['plt.savefig'] + Figure.savefig = self.original_functions['Figure.savefig'] + except ImportError: + pass + 
def __enter__(self): self.patch_open() self.patch_pathlib() self.patch_pandas() self.patch_numpy() + self.patch_matplotlib() return self def __exit__(self, exc_type, exc_val, exc_tb): diff --git a/src/fairscape_cli/tracking/metadata_generator.py b/src/fairscape_cli/tracking/metadata_generator.py index f2bf912..96dab2c 100644 --- a/src/fairscape_cli/tracking/metadata_generator.py +++ b/src/fairscape_cli/tracking/metadata_generator.py @@ -11,10 +11,10 @@ INPUT FORMAT: - Software code that was executed -- Input datasets with samples (first 5 rows) -- Output datasets with samples (first 5 rows) +- Input datasets/files (may include tabular data samples or image files) +- Output datasets/files (may include tabular data samples or image files) -OUTPUT FORMAT: JSON object with these keys: +OUTPUT FORMAT: Return ONLY a valid JSON object with these keys: {{ "software_description": "What this code does technically", "computation_description": "What this computation accomplishes", @@ -30,21 +30,23 @@ - Software description: 1-2 sentences, focus on operations performed - Computation description: 1-2 sentences, focus on scientific/analytical goal - Dataset descriptions: 1 sentence each, describe content type and role in workflow +- For images: Generate figure legend-style descriptions that describe what is shown in the image scientifically - Be technical but clear, assume scientific audience -- No markdown formatting, just plain JSON +- CRITICAL: Return ONLY valid JSON, no markdown code blocks, no explanations, no trailing commas +- Ensure all string values properly escape quotes and special characters SOFTWARE CODE: ```python {code} ``` -INPUT DATASETS: -{input_samples} +INPUT FILES: +{input_info} -OUTPUT DATASETS: -{output_samples} +OUTPUT FILES: +{output_info} -Generate the JSON now:""" +Return the JSON object now (no code blocks, no additional text):""" class MetadataGenerator(ABC): @@ -63,27 +65,35 @@ def generate_descriptions( class 
GeminiMetadataGenerator(MetadataGenerator): """Generate metadata using Google Gemini.""" - - def __init__(self, api_key: Optional[str] = None, temperature: float = 0.2, max_tokens: int = 2048): + + def __init__(self, api_key: Optional[str] = None, temperature: float = 0.2, max_tokens: int = 2048, max_images: int = 5): self.api_key = api_key or os.environ.get("GEMINI_API_KEY") self.temperature = temperature self.max_tokens = max_tokens - + self.max_images = max_images + if not self.api_key: raise ValueError("GEMINI_API_KEY not found in environment or provided") - def _build_prompt(self, code: str, input_samples: str, output_samples: str) -> str: + def _is_image_file(self, filepath: str) -> bool: + """Check if file is an image.""" + from pathlib import Path + suffix = Path(filepath).suffix.lower() + return suffix in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.tif', '.webp'] + + def _build_prompt(self, code: str, input_info: str, output_info: str) -> str: """Build the prompt for the LLM.""" return DESCRIPTION_PROMPT_TEMPLATE.format( code=code, - input_samples=input_samples, - output_samples=output_samples + input_info=input_info, + output_info=output_info ) - + def _parse_llm_response(self, response_text: str) -> Dict[str, Any]: """Parse JSON from LLM response, handling various formats.""" response_text = response_text.strip() - + + # Try to extract JSON from markdown code blocks json_match = re.search(r'```json\s*(\{.*?\})\s*```', response_text, re.DOTALL) if json_match: response_text = json_match.group(1) @@ -95,30 +105,140 @@ def _parse_llm_response(self, response_text: str) -> Dict[str, Any]: json_search = re.search(r'\{.*\}', response_text, re.DOTALL) if json_search: response_text = json_search.group(0) - - return json.loads(response_text) - + + # Clean common JSON formatting issues + response_text = response_text.strip() + + # Remove trailing commas before closing braces/brackets + response_text = re.sub(r',(\s*[}\]])', r'\1', response_text) + + # Try 
to parse + try: + return json.loads(response_text) + except json.JSONDecodeError as e: + # If parsing fails, try to provide helpful debug info + print(f"DEBUG: Failed to parse LLM response. Error: {e}", file=__import__('sys').stderr) + print(f"DEBUG: Response text (first 500 chars):\n{response_text[:500]}", file=__import__('sys').stderr) + print(f"DEBUG: Response text (last 500 chars):\n{response_text[-500:]}", file=__import__('sys').stderr) + + # Try one more aggressive cleanup: find the largest valid JSON object + # by matching braces more carefully + brace_count = 0 + start_idx = response_text.find('{') + if start_idx == -1: + raise ValueError("No JSON object found in LLM response") from e + + for i in range(start_idx, len(response_text)): + if response_text[i] == '{': + brace_count += 1 + elif response_text[i] == '}': + brace_count -= 1 + if brace_count == 0: + # Found matching closing brace + candidate = response_text[start_idx:i+1] + # Clean and try again + candidate = re.sub(r',(\s*[}\]])', r'\1', candidate) + try: + return json.loads(candidate) + except json.JSONDecodeError: + pass + + # If all else fails, re-raise original error + raise + def generate_descriptions( - self, - code: str, - input_samples: str, - output_samples: str + self, + code: str, + input_files: Dict[str, str], + output_files: Dict[str, str] ) -> Optional[Dict[str, Any]]: - """Generate descriptions using Gemini.""" + """Generate descriptions using Gemini with support for images. 
+ + Args: + code: The source code + input_files: Dict mapping filename to filepath + output_files: Dict mapping filename to filepath + """ import google.generativeai as genai - + from pathlib import Path + from PIL import Image + genai.configure(api_key=self.api_key) model = genai.GenerativeModel('gemini-2.5-flash') - - prompt = self._build_prompt(code, input_samples, output_samples) + + # Separate image files from other files + input_images = {k: v for k, v in input_files.items() if self._is_image_file(v)} + output_images = {k: v for k, v in output_files.items() if self._is_image_file(v)} + + # Limit total images to avoid Gemini safety filters + # Reserve 2 for inputs, rest for outputs (CAM visualizations are more important) + total_image_budget = self.max_images + max_input_images = min(2, len(input_images)) + max_output_images = min(total_image_budget - max_input_images, len(output_images)) + + input_images = dict(list(input_images.items())[:max_input_images]) + output_images = dict(list(output_images.items())[:max_output_images]) + + # Build text descriptions for files + input_info_parts = [] + for filename in input_files.keys(): + if filename in input_images: + input_info_parts.append(f"- {filename} (image - see attached)") + else: + input_info_parts.append(f"- {filename}") + + output_info_parts = [] + for filename in output_files.keys(): + if filename in output_images: + output_info_parts.append(f"- {filename} (image - see attached)") + else: + output_info_parts.append(f"- {filename}") + + input_info = "\n".join(input_info_parts) if input_info_parts else "None" + output_info = "\n".join(output_info_parts) if output_info_parts else "None" + + prompt = self._build_prompt(code, input_info, output_info) + + # Build content list with prompt and images + content = [prompt] + + # Add input images + for filename, filepath in input_images.items(): + try: + img = Image.open(filepath) + content.append(img) + content.append(f"Input image: {filename}") + except Exception 
as e: + print(f"WARNING: Could not load input image {filename}: {e}", file=__import__('sys').stderr) + + # Add output images + for filename, filepath in output_images.items(): + try: + img = Image.open(filepath) + content.append(img) + content.append(f"Output image: {filename}") + except Exception as e: + print(f"WARNING: Could not load output image {filename}: {e}", file=__import__('sys').stderr) + response = model.generate_content( - prompt, + content, generation_config=genai.types.GenerationConfig( temperature=self.temperature, max_output_tokens=self.max_tokens, + response_mime_type="application/json", ) ) - + + # Check if response was blocked by safety filters + if not response.candidates or not response.candidates[0].content.parts: + finish_reason = response.candidates[0].finish_reason if response.candidates else None + raise ValueError( + f"Gemini blocked the response (finish_reason={finish_reason}). " + f"This usually means too many images were sent. " + f"Sent {len(input_images)} input images + {len(output_images)} output images. " + f"Try reducing max_images parameter (currently {self.max_images})." 
+ ) + return self._parse_llm_response(response.text) @@ -168,7 +288,7 @@ def create_metadata_generator( if provider == "gemini": try: - gemini_kwargs = {k: v for k, v in kwargs.items() if k in ['temperature', 'max_tokens']} + gemini_kwargs = {k: v for k, v in kwargs.items() if k in ['temperature', 'max_tokens', 'max_images']} return GeminiMetadataGenerator(api_key=api_key, **gemini_kwargs) except ValueError: print("WARNING: GEMINI_API_KEY not found, using fallback descriptions") diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py index f27aa88..ddf24c7 100644 --- a/src/fairscape_cli/tracking/provenance_tracker.py +++ b/src/fairscape_cli/tracking/provenance_tracker.py @@ -1,6 +1,8 @@ from pathlib import Path from datetime import datetime from typing import List, Dict, Set, Optional +import json +import sys from fairscape_cli.models.rocrate import ReadROCrateMetadata, AppendCrate from fairscape_cli.models.dataset import GenerateDataset, Dataset @@ -17,7 +19,7 @@ class ProvenanceTracker: - + def __init__( self, config: ProvenanceConfig, @@ -28,9 +30,14 @@ def __init__( self.filepath_to_guid: Dict[str, str] = {} self.existing_guids: Set[str] = set() self.crate_metadata = None - + + self.reference_entities: Dict[str, tuple] = {} + self._ensure_crate_exists() + if self.config.start_clean: + self._clear_graph() self._load_crate_context() + self._load_reference_crates() def _ensure_crate_exists(self): metadata_path = self.config.rocrate_path / 'ro-crate-metadata.json' @@ -43,7 +50,7 @@ def _ensure_crate_exists(self): GenerateROCrate( path=self.config.rocrate_path, - guid=None, + guid=None, name=placeholder_name, description=placeholder_description, author=self.config.author if self.config.author != "Unknown" else "Researcher", @@ -53,7 +60,39 @@ def _ensure_crate_exists(self): license="https://creativecommons.org/licenses/by/4.0/", isPartOf=[] ) - + + def _clear_graph(self): + """Clear existing @graph entries 
except the metadata descriptor and root dataset.""" + metadata_path = self.config.rocrate_path / 'ro-crate-metadata.json' + + if not metadata_path.exists(): + return + + try: + with metadata_path.open('r') as f: + crate_data = json.load(f) + + graph = crate_data.get('@graph', []) + if len(graph) <= 2: + # Only metadata descriptor and root dataset, nothing to clear + return + + # Keep only first two elements: metadata descriptor and root dataset + crate_data['@graph'] = graph[:2] + + # Clear hasPart in root dataset + if len(crate_data['@graph']) > 1: + root_dataset = crate_data['@graph'][1] + if 'hasPart' in root_dataset: + root_dataset['hasPart'] = [] + + with metadata_path.open('w') as f: + json.dump(crate_data, f, indent=2) + + print(f"Cleared existing @graph entries from {metadata_path}") + + except Exception as e: + print(f"WARNING: Could not clear @graph: {e}") def _load_crate_context(self): try: @@ -86,7 +125,32 @@ def _load_crate_context(self): except Exception as e: raise RuntimeError(f"Could not read RO-Crate at {self.config.rocrate_path}: {e}") - + + def _load_reference_crates(self): + """Load reference RO-Crates to look up existing ARKs for input files.""" + for ref_crate_path in self.config.reference_crates: + try: + ref_metadata = ReadROCrateMetadata(ref_crate_path) + indexed_count = 0 + + for entity in ref_metadata['@graph']: + entity_guid = getattr(entity, 'guid', None) + if not entity_guid: + continue + + content_url = getattr(entity, 'contentUrl', None) + if content_url and content_url.startswith('file://'): + relative_path = content_url.replace('file:///', '').lstrip('/') + filepath_full = (ref_crate_path / relative_path).resolve() + + self.reference_entities[str(filepath_full)] = (entity_guid, entity) + indexed_count += 1 + + print(f"Loaded reference crate: {ref_crate_path} ({indexed_count} entities)") + + except Exception as e: + print(f"WARNING: Could not load reference crate {ref_crate_path}: {e}") + def _resolve_manual_inputs(self) -> 
Set[str]: manual_input_paths = set() for manual_input in self.config.manual_inputs: @@ -101,35 +165,57 @@ def _resolve_manual_inputs(self) -> Set[str]: def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]: all_input_files = set(io_capture.inputs) all_input_files.update(self._resolve_manual_inputs()) - + + # Build set of output files to detect intermediate files + output_files = {str(Path(f).resolve()) for f in io_capture.outputs} + input_datasets = [] reused_count = 0 - + for input_file in all_input_files: input_path = Path(input_file) - + if not input_path.exists(): print(f"WARNING: Input file does not exist: {input_file}") continue - + normalized_path = input_path.resolve() - + + # Skip files that were also written during this execution (intermediate files) + if str(normalized_path) in output_files: + print(f"INFO: Skipping intermediate file (written then read): {input_path.name}") + continue + + # Check if file exists in current crate if str(normalized_path) in self.filepath_to_guid: existing_guid = self.filepath_to_guid[str(normalized_path)] - print(f"Reusing existing dataset: {input_path.name} ({existing_guid})") - + existing_dataset = next( (e for e in self.crate_metadata['@graph'] if getattr(e, 'guid', None) == existing_guid), None ) if existing_dataset: - dataset_obj =Dataset.model_validate(existing_dataset) + dataset_obj = Dataset.model_validate(existing_dataset) input_datasets.append(dataset_obj) reused_count += 1 continue - - rel_path = input_path.relative_to(self.config.rocrate_path) if input_path.is_relative_to(self.config.rocrate_path) else input_path - + + # Check if file exists in reference crates + if str(normalized_path) in self.reference_entities: + ref_guid, ref_entity = self.reference_entities[str(normalized_path)] + + ref_entity._is_reference = True + input_datasets.append(ref_entity) + reused_count += 1 + continue + + # File not in any crate - create new dataset if it's within rocrate_path + if not 
input_path.is_relative_to(self.config.rocrate_path): + # Skip files outside the crate path silently (e.g., /dev/null) + continue + + rel_path = input_path.relative_to(self.config.rocrate_path) + dataset_metadata = GenerateDataset( name=input_path.name, author=self.config.author, @@ -147,12 +233,16 @@ def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]: def _resolve_outputs(self, io_capture: IOCapture) -> List[Dataset]: output_datasets = [] - + for output_file in io_capture.outputs: - output_path = Path(output_file) - - rel_path = output_path.relative_to(self.config.rocrate_path) if output_path.is_relative_to(self.config.rocrate_path) else output_path - + output_path = Path(output_file).resolve() + + # Skip files outside the crate path (e.g., /dev/null) + if not output_path.is_relative_to(self.config.rocrate_path): + continue + + rel_path = output_path.relative_to(self.config.rocrate_path) + dataset_metadata = GenerateDataset( name=output_path.name, author=self.config.author, @@ -166,9 +256,9 @@ def _resolve_outputs(self, io_capture: IOCapture) -> List[Dataset]: cratePath=self.config.rocrate_path ) output_datasets.append(dataset_metadata) - + return output_datasets - + def _enhance_with_llm( self, code: str, @@ -177,40 +267,52 @@ def _enhance_with_llm( ) -> Optional[Dict]: if not self.metadata_generator: + print("INFO: No metadata generator available - skipping LLM enhancement", file=sys.stderr) return None - - all_input_files = set() + + print(f"INFO: Starting LLM enhancement with {len(input_datasets)} inputs and {len(output_datasets)} outputs", file=sys.stderr) + + # Collect file paths with their filenames + input_files = {} # filename -> filepath for ds in input_datasets: content_url = getattr(ds, 'contentUrl', None) if content_url: url = content_url if isinstance(content_url, str) else content_url[0] filepath = url.replace('file:///', '').replace('file:', '') full_path = str((self.config.rocrate_path / filepath).resolve()) - 
all_input_files.add(full_path) + filename = Path(full_path).name + input_files[filename] = full_path - all_output_files = set() + output_files = {} # filename -> filepath for ds in output_datasets: content_url = getattr(ds, 'contentUrl', None) if content_url: url = content_url if isinstance(content_url, str) else content_url[0] - filepath = url.replace('file://', '').replace('file:', '') + filepath = url.replace('file:///', '').replace('file:', '') full_path = str((self.config.rocrate_path / filepath).resolve()) - all_output_files.add(full_path) - - input_samples = collect_dataset_samples(all_input_files) - output_samples = collect_dataset_samples(all_output_files) - - if not input_samples and not output_samples: + filename = Path(full_path).name + output_files[filename] = full_path + + print(f"INFO: Collected {len(input_files)} input files and {len(output_files)} output files", file=sys.stderr) + + # Call LLM even if there are no files - it can still describe the code + if not input_files and not output_files: + print("INFO: No files found, but calling LLM to describe code anyway", file=sys.stderr) + + print("INFO: Calling LLM to generate descriptions...", file=sys.stderr) + try: + result = self.metadata_generator.generate_descriptions( + code, + input_files, + output_files + ) + print("INFO: LLM successfully generated descriptions", file=sys.stderr) + return result + except Exception as exc: + print(f"WARNING: LLM description generation failed: {exc}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) return None - - input_samples_str = format_samples_for_prompt(input_samples) - output_samples_str = format_samples_for_prompt(output_samples) - - return self.metadata_generator.generate_descriptions( - code, - input_samples_str, - output_samples_str - ) def _apply_llm_descriptions( self, @@ -291,14 +393,14 @@ def track_execution( ) -> TrackingResult: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") execution_name = execution_name or 
f"cell_{timestamp}" - + if not io_capture.inputs and not io_capture.outputs and not self.config.manual_inputs: print("No file I/O detected in execution") raise ValueError("No file I/O detected") - + input_datasets, reused_count = self._resolve_inputs(io_capture) output_datasets = self._resolve_outputs(io_capture) - + llm_descriptions = self._enhance_with_llm(code, input_datasets, output_datasets) software_description, computation_description = self._apply_llm_descriptions( @@ -318,8 +420,12 @@ def track_execution( output_datasets ) - new_datasets = [ds for ds in input_datasets if ds.guid not in self.existing_guids] - + # Filter out datasets that already exist in current crate OR are references to external crates + new_datasets = [ + ds for ds in input_datasets + if ds.guid not in self.existing_guids and not getattr(ds, '_is_reference', False) + ] + elements = [software] + new_datasets + output_datasets + [computation] AppendCrate(cratePath=self.config.rocrate_path, elements=elements)