diff --git a/src/fairscape_cli/commands/track.py b/src/fairscape_cli/commands/track.py index ddafd20..8337f54 100644 --- a/src/fairscape_cli/commands/track.py +++ b/src/fairscape_cli/commands/track.py @@ -1,8 +1,9 @@ import click import pathlib import os +import sys import runpy -from typing import List +from typing import List, Tuple from fairscape_cli.tracking.io_capture import IOCapture from fairscape_cli.tracking.provenance_tracker import ProvenanceTracker @@ -10,14 +11,21 @@ from fairscape_cli.tracking.metadata_generator import create_metadata_generator -@click.command('track') +@click.command('track', context_settings=dict( + ignore_unknown_options=True, + allow_extra_args=True, +)) @click.argument('script-path', type=click.Path(exists=True, path_type=pathlib.Path)) @click.option('--rocrate-path', type=click.Path(path_type=pathlib.Path), default=None, help='Path to RO-Crate directory (default: current directory)') @click.option('--author', type=str, default="Unknown", help='Author name (default: from RO-Crate or "Unknown")') @click.option('--keywords', multiple=True, default=["computation"], help='Keywords for metadata (default: from RO-Crate or ["computation"])') @click.option('--input', 'manual_inputs', multiple=True, help='Manual input files to track') @click.option('--no-llm', is_flag=True, default=False, help='Disable LLM-based description generation') +@click.option('--max-images', type=int, default=5, help='Maximum number of images to send to LLM for analysis (default: 5)') @click.option('--execution-name', type=str, default=None, help='Name for this execution (default: script filename)') +@click.option('--reference-crate', 'reference_crates', multiple=True, type=click.Path(exists=True, path_type=pathlib.Path), help='Reference RO-Crate(s) to look up existing ARKs for input files') +@click.option('--start-clean', is_flag=True, default=False, help='Clear existing @graph entries (except root) before tracking') +@click.argument('script_args', nargs=-1, 
type=click.UNPROCESSED) @click.pass_context def track( ctx, @@ -27,27 +35,37 @@ def track( keywords: List[str], manual_inputs: List[str], no_llm: bool, - execution_name: str + max_images: int, + execution_name: str, + reference_crates: Tuple[pathlib.Path, ...], + start_clean: bool, + script_args: Tuple[str, ...] ): """Track execution of a Python script and generate provenance metadata. - + Executes SCRIPT_PATH while capturing file I/O operations, then generates RO-Crate metadata documenting the computation, software, input datasets, and output datasets. - + + Script arguments can be passed after the script path and options. + Examples: - + fairscape-cli track analysis.py - + fairscape-cli track analysis.py --author "Jane Doe" --keywords ml analysis - + fairscape-cli track analysis.py --rocrate-path ./my-crate --input config.json - + fairscape-cli track analysis.py --no-llm --author "John Smith" + + fairscape-cli track script.py -- ./output_dir --inputdir ./input_dir + + fairscape-cli track script.py --reference-crate ./input_data_crate -- ./output --inputdir ./input_data_crate/data """ - rocrate_path = rocrate_path or pathlib.Path.cwd() - + rocrate_path = (rocrate_path or pathlib.Path.cwd()).resolve() + if not script_path.exists(): click.echo(f"ERROR: Script file not found: {script_path}", err=True) ctx.exit(code=1) @@ -60,16 +78,26 @@ def track( ctx.exit(code=1) tracker_config = TrackerConfig() - + original_cwd = pathlib.Path.cwd() - script_dir = script_path.parent.resolve() - + + script_path_resolved = script_path.resolve() + + resolved_args = [] + for arg in script_args: + arg_path = pathlib.Path(arg) + if arg_path.exists() or (not arg.startswith('-') and ('/' in arg or '\\' in arg)): + resolved_args.append(str((original_cwd / arg).resolve())) + else: + resolved_args.append(arg) + + original_argv = sys.argv + sys.argv = [str(script_path_resolved)] + resolved_args + try: - os.chdir(script_dir) - with IOCapture(config=tracker_config) as capture: try: - 
runpy.run_path(str(script_path), run_name='__main__') + runpy.run_path(str(script_path_resolved), run_name='__main__') except SystemExit as e: if e.code != 0: click.echo(f"WARNING: Script exited with code {e.code}", err=True) @@ -77,23 +105,47 @@ def track( click.echo(f"ERROR: Script execution failed: {exc}", err=True) ctx.exit(code=1) finally: - os.chdir(original_cwd) - + sys.argv = original_argv + if not capture.inputs and not capture.outputs and not manual_inputs: click.echo("WARNING: No file I/O detected in script execution", err=True) click.echo("No metadata generated.", err=True) return - + use_llm = not no_llm and os.environ.get("GEMINI_API_KEY") - + + # Log LLM decision + if no_llm: + click.echo("INFO: LLM disabled via --no-llm flag", err=True) + elif not os.environ.get("GEMINI_API_KEY"): + click.echo("INFO: LLM disabled - GEMINI_API_KEY environment variable not set", err=True) + click.echo("INFO: Set GEMINI_API_KEY to enable AI-generated descriptions", err=True) + else: + click.echo("INFO: LLM enabled - will generate AI descriptions", err=True) + metadata_generator = None if use_llm: from datetime import datetime try: + # Test if google.generativeai is available + try: + import google.generativeai as genai + click.echo("INFO: google-generativeai package found", err=True) + except ImportError as imp_exc: + click.echo(f"WARNING: google-generativeai package not installed: {imp_exc}", err=True) + click.echo("INFO: Install with: pip install google-generativeai", err=True) + click.echo("Falling back to simple descriptions", err=True) + raise + metadata_generator = create_metadata_generator( provider="gemini", - timestamp=datetime.now().strftime("%Y%m%d_%H%M%S") + timestamp=datetime.now().strftime("%Y%m%d_%H%M%S"), + max_images=max_images ) + click.echo("INFO: LLM metadata generator initialized successfully", err=True) + except ImportError: + # Already logged above + pass except Exception as exc: click.echo(f"WARNING: Could not initialize LLM metadata generator: 
{exc}", err=True) click.echo("Falling back to simple descriptions", err=True) @@ -103,7 +155,9 @@ def track( author=author, keywords=list(keywords), manual_inputs=list(manual_inputs), - use_llm=use_llm + use_llm=use_llm, + reference_crates=list(reference_crates), + start_clean=start_clean ) try: diff --git a/src/fairscape_cli/models/utils.py b/src/fairscape_cli/models/utils.py index dd5c3a1..4368ca4 100644 --- a/src/fairscape_cli/models/utils.py +++ b/src/fairscape_cli/models/utils.py @@ -22,11 +22,17 @@ def setRelativeFilepath(cratePath, filePath): return filePath if 'ro-crate-metadata.json' in str(cratePath): - rocratePath = pathlib.Path(cratePath).parent.absolute() + rocratePath = pathlib.Path(cratePath).parent.resolve() else: - rocratePath = pathlib.Path(cratePath).absolute() - - datasetPath = pathlib.Path(filePath).absolute() + rocratePath = pathlib.Path(cratePath).resolve() + + # If filepath is relative, resolve it relative to the crate path, not cwd + filePathObj = pathlib.Path(filePath) + if not filePathObj.is_absolute(): + datasetPath = (rocratePath / filePath).resolve() + else: + datasetPath = filePathObj.resolve() + relativePath = datasetPath.relative_to(rocratePath) return f"file:///{str(relativePath)}" diff --git a/src/fairscape_cli/tracking/config.py b/src/fairscape_cli/tracking/config.py index 0feb2d4..aebc505 100644 --- a/src/fairscape_cli/tracking/config.py +++ b/src/fairscape_cli/tracking/config.py @@ -9,6 +9,7 @@ class TrackerConfig: track_pathlib: bool = True track_pandas: bool = True track_numpy: bool = True + track_matplotlib: bool = True excluded_patterns: List[str] = field(default_factory=lambda: [ '.matplotlib', '.ipython', @@ -44,3 +45,5 @@ class ProvenanceConfig: keywords: List[str] = field(default_factory=lambda: ["jupyter", "computation"]) manual_inputs: List[str] = field(default_factory=list) use_llm: bool = False + reference_crates: List[Path] = field(default_factory=list) + start_clean: bool = False diff --git 
a/src/fairscape_cli/tracking/io_capture.py b/src/fairscape_cli/tracking/io_capture.py index f02c440..d1a0f98 100644 --- a/src/fairscape_cli/tracking/io_capture.py +++ b/src/fairscape_cli/tracking/io_capture.py @@ -179,55 +179,87 @@ def patch_numpy(self): """Patch numpy methods to track file I/O.""" if not self.config.track_numpy: return - + original_load = np.load original_save = np.save original_loadtxt = np.loadtxt original_savetxt = np.savetxt - + self.original_functions['np.load'] = original_load self.original_functions['np.save'] = original_save self.original_functions['np.loadtxt'] = original_loadtxt self.original_functions['np.savetxt'] = original_savetxt - + capture = self - + def tracked_load(file, *args, **kwargs): if capture._should_track(file): capture.inputs.add(capture._normalize_path(file)) return original_load(file, *args, **kwargs) - + def tracked_save(file, arr, *args, **kwargs): if capture._should_track(file): capture.outputs.add(capture._normalize_path(file)) return original_save(file, arr, *args, **kwargs) - + def tracked_loadtxt(fname, *args, **kwargs): if capture._should_track(fname): capture.inputs.add(capture._normalize_path(fname)) return original_loadtxt(fname, *args, **kwargs) - + def tracked_savetxt(fname, X, *args, **kwargs): if capture._should_track(fname): capture.outputs.add(capture._normalize_path(fname)) return original_savetxt(fname, X, *args, **kwargs) - + np.load = tracked_load np.save = tracked_save np.loadtxt = tracked_loadtxt np.savetxt = tracked_savetxt + + def patch_matplotlib(self): + """Patch matplotlib methods to track file I/O.""" + if not self.config.track_matplotlib: + return + + try: + import matplotlib.pyplot as plt + from matplotlib.figure import Figure + except ImportError: + return + + original_plt_savefig = plt.savefig + original_figure_savefig = Figure.savefig + + self.original_functions['plt.savefig'] = original_plt_savefig + self.original_functions['Figure.savefig'] = original_figure_savefig + + capture = 
self + + def tracked_plt_savefig(fname, *args, **kwargs): + if capture._should_track(fname): + capture.outputs.add(capture._normalize_path(fname)) + return original_plt_savefig(fname, *args, **kwargs) + + def tracked_figure_savefig(self, fname, *args, **kwargs): + if capture._should_track(fname): + capture.outputs.add(capture._normalize_path(fname)) + return original_figure_savefig(self, fname, *args, **kwargs) + + plt.savefig = tracked_plt_savefig + Figure.savefig = tracked_figure_savefig def restore_all(self): """Restore all original functions.""" builtins.open = self.original_functions.get('builtins.open', builtins.open) - + if 'pathlib.Path.open' in self.original_functions: pathlib.Path.open = self.original_functions['pathlib.Path.open'] pathlib.Path.read_text = self.original_functions['pathlib.Path.read_text'] pathlib.Path.read_bytes = self.original_functions['pathlib.Path.read_bytes'] pathlib.Path.write_text = self.original_functions['pathlib.Path.write_text'] pathlib.Path.write_bytes = self.original_functions['pathlib.Path.write_bytes'] - + if 'pd.read_csv' in self.original_functions: pd.read_csv = self.original_functions['pd.read_csv'] pd.read_excel = self.original_functions['pd.read_excel'] @@ -237,18 +269,28 @@ def restore_all(self): pd.DataFrame.to_excel = self.original_functions['pd.DataFrame.to_excel'] pd.DataFrame.to_parquet = self.original_functions['pd.DataFrame.to_parquet'] pd.DataFrame.to_json = self.original_functions['pd.DataFrame.to_json'] - + if 'np.load' in self.original_functions: np.load = self.original_functions['np.load'] np.save = self.original_functions['np.save'] np.loadtxt = self.original_functions['np.loadtxt'] np.savetxt = self.original_functions['np.savetxt'] - + + if 'plt.savefig' in self.original_functions: + try: + import matplotlib.pyplot as plt + from matplotlib.figure import Figure + plt.savefig = self.original_functions['plt.savefig'] + Figure.savefig = self.original_functions['Figure.savefig'] + except ImportError: + pass + 
def __enter__(self): self.patch_open() self.patch_pathlib() self.patch_pandas() self.patch_numpy() + self.patch_matplotlib() return self def __exit__(self, exc_type, exc_val, exc_tb): diff --git a/src/fairscape_cli/tracking/metadata_generator.py b/src/fairscape_cli/tracking/metadata_generator.py index f2bf912..96dab2c 100644 --- a/src/fairscape_cli/tracking/metadata_generator.py +++ b/src/fairscape_cli/tracking/metadata_generator.py @@ -11,10 +11,10 @@ INPUT FORMAT: - Software code that was executed -- Input datasets with samples (first 5 rows) -- Output datasets with samples (first 5 rows) +- Input datasets/files (may include tabular data samples or image files) +- Output datasets/files (may include tabular data samples or image files) -OUTPUT FORMAT: JSON object with these keys: +OUTPUT FORMAT: Return ONLY a valid JSON object with these keys: {{ "software_description": "What this code does technically", "computation_description": "What this computation accomplishes", @@ -30,21 +30,23 @@ - Software description: 1-2 sentences, focus on operations performed - Computation description: 1-2 sentences, focus on scientific/analytical goal - Dataset descriptions: 1 sentence each, describe content type and role in workflow +- For images: Generate figure legend-style descriptions that describe what is shown in the image scientifically - Be technical but clear, assume scientific audience -- No markdown formatting, just plain JSON +- CRITICAL: Return ONLY valid JSON, no markdown code blocks, no explanations, no trailing commas +- Ensure all string values properly escape quotes and special characters SOFTWARE CODE: ```python {code} ``` -INPUT DATASETS: -{input_samples} +INPUT FILES: +{input_info} -OUTPUT DATASETS: -{output_samples} +OUTPUT FILES: +{output_info} -Generate the JSON now:""" +Return the JSON object now (no code blocks, no additional text):""" class MetadataGenerator(ABC): @@ -63,27 +65,35 @@ def generate_descriptions( class 
GeminiMetadataGenerator(MetadataGenerator): """Generate metadata using Google Gemini.""" - - def __init__(self, api_key: Optional[str] = None, temperature: float = 0.2, max_tokens: int = 2048): + + def __init__(self, api_key: Optional[str] = None, temperature: float = 0.2, max_tokens: int = 2048, max_images: int = 5): self.api_key = api_key or os.environ.get("GEMINI_API_KEY") self.temperature = temperature self.max_tokens = max_tokens - + self.max_images = max_images + if not self.api_key: raise ValueError("GEMINI_API_KEY not found in environment or provided") - def _build_prompt(self, code: str, input_samples: str, output_samples: str) -> str: + def _is_image_file(self, filepath: str) -> bool: + """Check if file is an image.""" + from pathlib import Path + suffix = Path(filepath).suffix.lower() + return suffix in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.tif', '.webp'] + + def _build_prompt(self, code: str, input_info: str, output_info: str) -> str: """Build the prompt for the LLM.""" return DESCRIPTION_PROMPT_TEMPLATE.format( code=code, - input_samples=input_samples, - output_samples=output_samples + input_info=input_info, + output_info=output_info ) - + def _parse_llm_response(self, response_text: str) -> Dict[str, Any]: """Parse JSON from LLM response, handling various formats.""" response_text = response_text.strip() - + + # Try to extract JSON from markdown code blocks json_match = re.search(r'```json\s*(\{.*?\})\s*```', response_text, re.DOTALL) if json_match: response_text = json_match.group(1) @@ -95,30 +105,140 @@ def _parse_llm_response(self, response_text: str) -> Dict[str, Any]: json_search = re.search(r'\{.*\}', response_text, re.DOTALL) if json_search: response_text = json_search.group(0) - - return json.loads(response_text) - + + # Clean common JSON formatting issues + response_text = response_text.strip() + + # Remove trailing commas before closing braces/brackets + response_text = re.sub(r',(\s*[}\]])', r'\1', response_text) + + # Try 
to parse + try: + return json.loads(response_text) + except json.JSONDecodeError as e: + # If parsing fails, try to provide helpful debug info + print(f"DEBUG: Failed to parse LLM response. Error: {e}", file=__import__('sys').stderr) + print(f"DEBUG: Response text (first 500 chars):\n{response_text[:500]}", file=__import__('sys').stderr) + print(f"DEBUG: Response text (last 500 chars):\n{response_text[-500:]}", file=__import__('sys').stderr) + + # Try one more aggressive cleanup: find the largest valid JSON object + # by matching braces more carefully + brace_count = 0 + start_idx = response_text.find('{') + if start_idx == -1: + raise ValueError("No JSON object found in LLM response") from e + + for i in range(start_idx, len(response_text)): + if response_text[i] == '{': + brace_count += 1 + elif response_text[i] == '}': + brace_count -= 1 + if brace_count == 0: + # Found matching closing brace + candidate = response_text[start_idx:i+1] + # Clean and try again + candidate = re.sub(r',(\s*[}\]])', r'\1', candidate) + try: + return json.loads(candidate) + except json.JSONDecodeError: + pass + + # If all else fails, re-raise original error + raise + def generate_descriptions( - self, - code: str, - input_samples: str, - output_samples: str + self, + code: str, + input_files: Dict[str, str], + output_files: Dict[str, str] ) -> Optional[Dict[str, Any]]: - """Generate descriptions using Gemini.""" + """Generate descriptions using Gemini with support for images. 
+ + Args: + code: The source code + input_files: Dict mapping filename to filepath + output_files: Dict mapping filename to filepath + """ import google.generativeai as genai - + from pathlib import Path + from PIL import Image + genai.configure(api_key=self.api_key) model = genai.GenerativeModel('gemini-2.5-flash') - - prompt = self._build_prompt(code, input_samples, output_samples) + + # Separate image files from other files + input_images = {k: v for k, v in input_files.items() if self._is_image_file(v)} + output_images = {k: v for k, v in output_files.items() if self._is_image_file(v)} + + # Limit total images to avoid Gemini safety filters + # Reserve 2 for inputs, rest for outputs (CAM visualizations are more important) + total_image_budget = self.max_images + max_input_images = min(2, len(input_images)) + max_output_images = min(total_image_budget - max_input_images, len(output_images)) + + input_images = dict(list(input_images.items())[:max_input_images]) + output_images = dict(list(output_images.items())[:max_output_images]) + + # Build text descriptions for files + input_info_parts = [] + for filename in input_files.keys(): + if filename in input_images: + input_info_parts.append(f"- {filename} (image - see attached)") + else: + input_info_parts.append(f"- {filename}") + + output_info_parts = [] + for filename in output_files.keys(): + if filename in output_images: + output_info_parts.append(f"- {filename} (image - see attached)") + else: + output_info_parts.append(f"- {filename}") + + input_info = "\n".join(input_info_parts) if input_info_parts else "None" + output_info = "\n".join(output_info_parts) if output_info_parts else "None" + + prompt = self._build_prompt(code, input_info, output_info) + + # Build content list with prompt and images + content = [prompt] + + # Add input images + for filename, filepath in input_images.items(): + try: + img = Image.open(filepath) + content.append(img) + content.append(f"Input image: {filename}") + except Exception 
as e: + print(f"WARNING: Could not load input image {filename}: {e}", file=__import__('sys').stderr) + + # Add output images + for filename, filepath in output_images.items(): + try: + img = Image.open(filepath) + content.append(img) + content.append(f"Output image: {filename}") + except Exception as e: + print(f"WARNING: Could not load output image {filename}: {e}", file=__import__('sys').stderr) + response = model.generate_content( - prompt, + content, generation_config=genai.types.GenerationConfig( temperature=self.temperature, max_output_tokens=self.max_tokens, + response_mime_type="application/json", ) ) - + + # Check if response was blocked by safety filters + if not response.candidates or not response.candidates[0].content.parts: + finish_reason = response.candidates[0].finish_reason if response.candidates else None + raise ValueError( + f"Gemini blocked the response (finish_reason={finish_reason}). " + f"This usually means too many images were sent. " + f"Sent {len(input_images)} input images + {len(output_images)} output images. " + f"Try reducing max_images parameter (currently {self.max_images})." 
+ ) + return self._parse_llm_response(response.text) @@ -168,7 +288,7 @@ def create_metadata_generator( if provider == "gemini": try: - gemini_kwargs = {k: v for k, v in kwargs.items() if k in ['temperature', 'max_tokens']} + gemini_kwargs = {k: v for k, v in kwargs.items() if k in ['temperature', 'max_tokens', 'max_images']} return GeminiMetadataGenerator(api_key=api_key, **gemini_kwargs) except ValueError: print("WARNING: GEMINI_API_KEY not found, using fallback descriptions") diff --git a/src/fairscape_cli/tracking/provenance_tracker.py b/src/fairscape_cli/tracking/provenance_tracker.py index f27aa88..ddf24c7 100644 --- a/src/fairscape_cli/tracking/provenance_tracker.py +++ b/src/fairscape_cli/tracking/provenance_tracker.py @@ -1,6 +1,8 @@ from pathlib import Path from datetime import datetime from typing import List, Dict, Set, Optional +import json +import sys from fairscape_cli.models.rocrate import ReadROCrateMetadata, AppendCrate from fairscape_cli.models.dataset import GenerateDataset, Dataset @@ -17,7 +19,7 @@ class ProvenanceTracker: - + def __init__( self, config: ProvenanceConfig, @@ -28,9 +30,14 @@ def __init__( self.filepath_to_guid: Dict[str, str] = {} self.existing_guids: Set[str] = set() self.crate_metadata = None - + + self.reference_entities: Dict[str, tuple] = {} + self._ensure_crate_exists() + if self.config.start_clean: + self._clear_graph() self._load_crate_context() + self._load_reference_crates() def _ensure_crate_exists(self): metadata_path = self.config.rocrate_path / 'ro-crate-metadata.json' @@ -43,7 +50,7 @@ def _ensure_crate_exists(self): GenerateROCrate( path=self.config.rocrate_path, - guid=None, + guid=None, name=placeholder_name, description=placeholder_description, author=self.config.author if self.config.author != "Unknown" else "Researcher", @@ -53,7 +60,39 @@ def _ensure_crate_exists(self): license="https://creativecommons.org/licenses/by/4.0/", isPartOf=[] ) - + + def _clear_graph(self): + """Clear existing @graph entries 
except the metadata descriptor and root dataset.""" + metadata_path = self.config.rocrate_path / 'ro-crate-metadata.json' + + if not metadata_path.exists(): + return + + try: + with metadata_path.open('r') as f: + crate_data = json.load(f) + + graph = crate_data.get('@graph', []) + if len(graph) <= 2: + # Only metadata descriptor and root dataset, nothing to clear + return + + # Keep only first two elements: metadata descriptor and root dataset + crate_data['@graph'] = graph[:2] + + # Clear hasPart in root dataset + if len(crate_data['@graph']) > 1: + root_dataset = crate_data['@graph'][1] + if 'hasPart' in root_dataset: + root_dataset['hasPart'] = [] + + with metadata_path.open('w') as f: + json.dump(crate_data, f, indent=2) + + print(f"Cleared existing @graph entries from {metadata_path}") + + except Exception as e: + print(f"WARNING: Could not clear @graph: {e}") def _load_crate_context(self): try: @@ -86,7 +125,32 @@ def _load_crate_context(self): except Exception as e: raise RuntimeError(f"Could not read RO-Crate at {self.config.rocrate_path}: {e}") - + + def _load_reference_crates(self): + """Load reference RO-Crates to look up existing ARKs for input files.""" + for ref_crate_path in self.config.reference_crates: + try: + ref_metadata = ReadROCrateMetadata(ref_crate_path) + indexed_count = 0 + + for entity in ref_metadata['@graph']: + entity_guid = getattr(entity, 'guid', None) + if not entity_guid: + continue + + content_url = getattr(entity, 'contentUrl', None) + if content_url and content_url.startswith('file://'): + relative_path = content_url.replace('file:///', '').lstrip('/') + filepath_full = (ref_crate_path / relative_path).resolve() + + self.reference_entities[str(filepath_full)] = (entity_guid, entity) + indexed_count += 1 + + print(f"Loaded reference crate: {ref_crate_path} ({indexed_count} entities)") + + except Exception as e: + print(f"WARNING: Could not load reference crate {ref_crate_path}: {e}") + def _resolve_manual_inputs(self) -> 
Set[str]: manual_input_paths = set() for manual_input in self.config.manual_inputs: @@ -101,35 +165,57 @@ def _resolve_manual_inputs(self) -> Set[str]: def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]: all_input_files = set(io_capture.inputs) all_input_files.update(self._resolve_manual_inputs()) - + + # Build set of output files to detect intermediate files + output_files = {str(Path(f).resolve()) for f in io_capture.outputs} + input_datasets = [] reused_count = 0 - + for input_file in all_input_files: input_path = Path(input_file) - + if not input_path.exists(): print(f"WARNING: Input file does not exist: {input_file}") continue - + normalized_path = input_path.resolve() - + + # Skip files that were also written during this execution (intermediate files) + if str(normalized_path) in output_files: + print(f"INFO: Skipping intermediate file (written then read): {input_path.name}") + continue + + # Check if file exists in current crate if str(normalized_path) in self.filepath_to_guid: existing_guid = self.filepath_to_guid[str(normalized_path)] - print(f"Reusing existing dataset: {input_path.name} ({existing_guid})") - + existing_dataset = next( (e for e in self.crate_metadata['@graph'] if getattr(e, 'guid', None) == existing_guid), None ) if existing_dataset: - dataset_obj =Dataset.model_validate(existing_dataset) + dataset_obj = Dataset.model_validate(existing_dataset) input_datasets.append(dataset_obj) reused_count += 1 continue - - rel_path = input_path.relative_to(self.config.rocrate_path) if input_path.is_relative_to(self.config.rocrate_path) else input_path - + + # Check if file exists in reference crates + if str(normalized_path) in self.reference_entities: + ref_guid, ref_entity = self.reference_entities[str(normalized_path)] + + ref_entity._is_reference = True + input_datasets.append(ref_entity) + reused_count += 1 + continue + + # File not in any crate - create new dataset if it's within rocrate_path + if not 
input_path.is_relative_to(self.config.rocrate_path): + # Skip files outside the crate path silently (e.g., /dev/null) + continue + + rel_path = input_path.relative_to(self.config.rocrate_path) + dataset_metadata = GenerateDataset( name=input_path.name, author=self.config.author, @@ -147,12 +233,16 @@ def _resolve_inputs(self, io_capture: IOCapture) -> tuple[List[Dataset], int]: def _resolve_outputs(self, io_capture: IOCapture) -> List[Dataset]: output_datasets = [] - + for output_file in io_capture.outputs: - output_path = Path(output_file) - - rel_path = output_path.relative_to(self.config.rocrate_path) if output_path.is_relative_to(self.config.rocrate_path) else output_path - + output_path = Path(output_file).resolve() + + # Skip files outside the crate path (e.g., /dev/null) + if not output_path.is_relative_to(self.config.rocrate_path): + continue + + rel_path = output_path.relative_to(self.config.rocrate_path) + dataset_metadata = GenerateDataset( name=output_path.name, author=self.config.author, @@ -166,9 +256,9 @@ def _resolve_outputs(self, io_capture: IOCapture) -> List[Dataset]: cratePath=self.config.rocrate_path ) output_datasets.append(dataset_metadata) - + return output_datasets - + def _enhance_with_llm( self, code: str, @@ -177,40 +267,52 @@ def _enhance_with_llm( ) -> Optional[Dict]: if not self.metadata_generator: + print("INFO: No metadata generator available - skipping LLM enhancement", file=sys.stderr) return None - - all_input_files = set() + + print(f"INFO: Starting LLM enhancement with {len(input_datasets)} inputs and {len(output_datasets)} outputs", file=sys.stderr) + + # Collect file paths with their filenames + input_files = {} # filename -> filepath for ds in input_datasets: content_url = getattr(ds, 'contentUrl', None) if content_url: url = content_url if isinstance(content_url, str) else content_url[0] filepath = url.replace('file:///', '').replace('file:', '') full_path = str((self.config.rocrate_path / filepath).resolve()) - 
all_input_files.add(full_path) + filename = Path(full_path).name + input_files[filename] = full_path - all_output_files = set() + output_files = {} # filename -> filepath for ds in output_datasets: content_url = getattr(ds, 'contentUrl', None) if content_url: url = content_url if isinstance(content_url, str) else content_url[0] - filepath = url.replace('file://', '').replace('file:', '') + filepath = url.replace('file:///', '').replace('file:', '') full_path = str((self.config.rocrate_path / filepath).resolve()) - all_output_files.add(full_path) - - input_samples = collect_dataset_samples(all_input_files) - output_samples = collect_dataset_samples(all_output_files) - - if not input_samples and not output_samples: + filename = Path(full_path).name + output_files[filename] = full_path + + print(f"INFO: Collected {len(input_files)} input files and {len(output_files)} output files", file=sys.stderr) + + # Call LLM even if there are no files - it can still describe the code + if not input_files and not output_files: + print("INFO: No files found, but calling LLM to describe code anyway", file=sys.stderr) + + print("INFO: Calling LLM to generate descriptions...", file=sys.stderr) + try: + result = self.metadata_generator.generate_descriptions( + code, + input_files, + output_files + ) + print("INFO: LLM successfully generated descriptions", file=sys.stderr) + return result + except Exception as exc: + print(f"WARNING: LLM description generation failed: {exc}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) return None - - input_samples_str = format_samples_for_prompt(input_samples) - output_samples_str = format_samples_for_prompt(output_samples) - - return self.metadata_generator.generate_descriptions( - code, - input_samples_str, - output_samples_str - ) def _apply_llm_descriptions( self, @@ -291,14 +393,14 @@ def track_execution( ) -> TrackingResult: timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") execution_name = execution_name or 
f"cell_{timestamp}" - + if not io_capture.inputs and not io_capture.outputs and not self.config.manual_inputs: print("No file I/O detected in execution") raise ValueError("No file I/O detected") - + input_datasets, reused_count = self._resolve_inputs(io_capture) output_datasets = self._resolve_outputs(io_capture) - + llm_descriptions = self._enhance_with_llm(code, input_datasets, output_datasets) software_description, computation_description = self._apply_llm_descriptions( @@ -318,8 +420,12 @@ def track_execution( output_datasets ) - new_datasets = [ds for ds in input_datasets if ds.guid not in self.existing_guids] - + # Filter out datasets that already exist in current crate OR are references to external crates + new_datasets = [ + ds for ds in input_datasets + if ds.guid not in self.existing_guids and not getattr(ds, '_is_reference', False) + ] + elements = [software] + new_datasets + output_datasets + [computation] AppendCrate(cratePath=self.config.rocrate_path, elements=elements)