Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 77 additions & 23 deletions src/fairscape_cli/commands/track.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,31 @@
import click
import pathlib
import os
import sys
import runpy
from typing import List
from typing import List, Tuple

from fairscape_cli.tracking.io_capture import IOCapture
from fairscape_cli.tracking.provenance_tracker import ProvenanceTracker
from fairscape_cli.tracking.config import ProvenanceConfig, TrackerConfig
from fairscape_cli.tracking.metadata_generator import create_metadata_generator


@click.command('track')
@click.command('track', context_settings=dict(
ignore_unknown_options=True,
allow_extra_args=True,
))
@click.argument('script-path', type=click.Path(exists=True, path_type=pathlib.Path))
@click.option('--rocrate-path', type=click.Path(path_type=pathlib.Path), default=None, help='Path to RO-Crate directory (default: current directory)')
@click.option('--author', type=str, default="Unknown", help='Author name (default: from RO-Crate or "Unknown")')
@click.option('--keywords', multiple=True, default=["computation"], help='Keywords for metadata (default: from RO-Crate or ["computation"])')
@click.option('--input', 'manual_inputs', multiple=True, help='Manual input files to track')
@click.option('--no-llm', is_flag=True, default=False, help='Disable LLM-based description generation')
@click.option('--max-images', type=int, default=5, help='Maximum number of images to send to LLM for analysis (default: 5)')
@click.option('--execution-name', type=str, default=None, help='Name for this execution (default: script filename)')
@click.option('--reference-crate', 'reference_crates', multiple=True, type=click.Path(exists=True, path_type=pathlib.Path), help='Reference RO-Crate(s) to look up existing ARKs for input files')
@click.option('--start-clean', is_flag=True, default=False, help='Clear existing @graph entries (except root) before tracking')
@click.argument('script_args', nargs=-1, type=click.UNPROCESSED)
@click.pass_context
def track(
ctx,
Expand All @@ -27,27 +35,37 @@ def track(
keywords: List[str],
manual_inputs: List[str],
no_llm: bool,
execution_name: str
max_images: int,
execution_name: str,
reference_crates: Tuple[pathlib.Path, ...],
start_clean: bool,
script_args: Tuple[str, ...]
):
"""Track execution of a Python script and generate provenance metadata.

Executes SCRIPT_PATH while capturing file I/O operations, then generates
RO-Crate metadata documenting the computation, software, input datasets,
and output datasets.


Script arguments can be passed after the script path and options.

Examples:

fairscape-cli track analysis.py

fairscape-cli track analysis.py --author "Jane Doe" --keywords ml analysis

fairscape-cli track analysis.py --rocrate-path ./my-crate --input config.json

fairscape-cli track analysis.py --no-llm --author "John Smith"

fairscape-cli track script.py -- ./output_dir --inputdir ./input_dir

fairscape-cli track script.py --reference-crate ./input_data_crate -- ./output --inputdir ./input_data_crate/data
"""

rocrate_path = rocrate_path or pathlib.Path.cwd()
rocrate_path = (rocrate_path or pathlib.Path.cwd()).resolve()

if not script_path.exists():
click.echo(f"ERROR: Script file not found: {script_path}", err=True)
ctx.exit(code=1)
Expand All @@ -60,40 +78,74 @@ def track(
ctx.exit(code=1)

tracker_config = TrackerConfig()

original_cwd = pathlib.Path.cwd()
script_dir = script_path.parent.resolve()


script_path_resolved = script_path.resolve()

resolved_args = []
for arg in script_args:
arg_path = pathlib.Path(arg)
if arg_path.exists() or (not arg.startswith('-') and ('/' in arg or '\\' in arg)):
resolved_args.append(str((original_cwd / arg).resolve()))
else:
resolved_args.append(arg)

original_argv = sys.argv
sys.argv = [str(script_path_resolved)] + resolved_args

try:
os.chdir(script_dir)

with IOCapture(config=tracker_config) as capture:
try:
runpy.run_path(str(script_path), run_name='__main__')
runpy.run_path(str(script_path_resolved), run_name='__main__')
except SystemExit as e:
if e.code != 0:
click.echo(f"WARNING: Script exited with code {e.code}", err=True)
except Exception as exc:
click.echo(f"ERROR: Script execution failed: {exc}", err=True)
ctx.exit(code=1)
finally:
os.chdir(original_cwd)
sys.argv = original_argv

if not capture.inputs and not capture.outputs and not manual_inputs:
click.echo("WARNING: No file I/O detected in script execution", err=True)
click.echo("No metadata generated.", err=True)
return

use_llm = not no_llm and os.environ.get("GEMINI_API_KEY")


# Log LLM decision
if no_llm:
click.echo("INFO: LLM disabled via --no-llm flag", err=True)
elif not os.environ.get("GEMINI_API_KEY"):
click.echo("INFO: LLM disabled - GEMINI_API_KEY environment variable not set", err=True)
click.echo("INFO: Set GEMINI_API_KEY to enable AI-generated descriptions", err=True)
else:
click.echo("INFO: LLM enabled - will generate AI descriptions", err=True)

metadata_generator = None
if use_llm:
from datetime import datetime
try:
# Test if google.generativeai is available
try:
import google.generativeai as genai
click.echo("INFO: google-generativeai package found", err=True)
except ImportError as imp_exc:
click.echo(f"WARNING: google-generativeai package not installed: {imp_exc}", err=True)
click.echo("INFO: Install with: pip install google-generativeai", err=True)
click.echo("Falling back to simple descriptions", err=True)
raise

metadata_generator = create_metadata_generator(
provider="gemini",
timestamp=datetime.now().strftime("%Y%m%d_%H%M%S")
timestamp=datetime.now().strftime("%Y%m%d_%H%M%S"),
max_images=max_images
)
click.echo("INFO: LLM metadata generator initialized successfully", err=True)
except ImportError:
# Already logged above
pass
except Exception as exc:
click.echo(f"WARNING: Could not initialize LLM metadata generator: {exc}", err=True)
click.echo("Falling back to simple descriptions", err=True)
Expand All @@ -103,7 +155,9 @@ def track(
author=author,
keywords=list(keywords),
manual_inputs=list(manual_inputs),
use_llm=use_llm
use_llm=use_llm,
reference_crates=list(reference_crates),
start_clean=start_clean
)

try:
Expand Down
14 changes: 10 additions & 4 deletions src/fairscape_cli/models/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,17 @@ def setRelativeFilepath(cratePath, filePath):
return filePath

if 'ro-crate-metadata.json' in str(cratePath):
rocratePath = pathlib.Path(cratePath).parent.absolute()
rocratePath = pathlib.Path(cratePath).parent.resolve()
else:
rocratePath = pathlib.Path(cratePath).absolute()

datasetPath = pathlib.Path(filePath).absolute()
rocratePath = pathlib.Path(cratePath).resolve()

# If filepath is relative, resolve it relative to the crate path, not cwd
filePathObj = pathlib.Path(filePath)
if not filePathObj.is_absolute():
datasetPath = (rocratePath / filePath).resolve()
else:
datasetPath = filePathObj.resolve()

relativePath = datasetPath.relative_to(rocratePath)
return f"file:///{str(relativePath)}"

Expand Down
3 changes: 3 additions & 0 deletions src/fairscape_cli/tracking/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ class TrackerConfig:
track_pathlib: bool = True
track_pandas: bool = True
track_numpy: bool = True
track_matplotlib: bool = True
excluded_patterns: List[str] = field(default_factory=lambda: [
'.matplotlib',
'.ipython',
Expand Down Expand Up @@ -44,3 +45,5 @@ class ProvenanceConfig:
keywords: List[str] = field(default_factory=lambda: ["jupyter", "computation"])
manual_inputs: List[str] = field(default_factory=list)
use_llm: bool = False
reference_crates: List[Path] = field(default_factory=list)
start_clean: bool = False
66 changes: 54 additions & 12 deletions src/fairscape_cli/tracking/io_capture.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,55 +179,87 @@ def patch_numpy(self):
"""Patch numpy methods to track file I/O."""
if not self.config.track_numpy:
return

original_load = np.load
original_save = np.save
original_loadtxt = np.loadtxt
original_savetxt = np.savetxt

self.original_functions['np.load'] = original_load
self.original_functions['np.save'] = original_save
self.original_functions['np.loadtxt'] = original_loadtxt
self.original_functions['np.savetxt'] = original_savetxt

capture = self

def tracked_load(file, *args, **kwargs):
if capture._should_track(file):
capture.inputs.add(capture._normalize_path(file))
return original_load(file, *args, **kwargs)

def tracked_save(file, arr, *args, **kwargs):
if capture._should_track(file):
capture.outputs.add(capture._normalize_path(file))
return original_save(file, arr, *args, **kwargs)

def tracked_loadtxt(fname, *args, **kwargs):
if capture._should_track(fname):
capture.inputs.add(capture._normalize_path(fname))
return original_loadtxt(fname, *args, **kwargs)

def tracked_savetxt(fname, X, *args, **kwargs):
if capture._should_track(fname):
capture.outputs.add(capture._normalize_path(fname))
return original_savetxt(fname, X, *args, **kwargs)

np.load = tracked_load
np.save = tracked_save
np.loadtxt = tracked_loadtxt
np.savetxt = tracked_savetxt

def patch_matplotlib(self):
    """Patch matplotlib's ``savefig`` entry points to record output files.

    Does nothing when ``config.track_matplotlib`` is false or when
    matplotlib cannot be imported. Otherwise the current ``plt.savefig``
    and ``Figure.savefig`` are stored in ``self.original_functions`` under
    the keys ``'plt.savefig'`` and ``'Figure.savefig'`` and replaced with
    wrappers that add the normalized target to ``self.outputs`` before
    delegating to the original callable.
    """
    if not self.config.track_matplotlib:
        return

    try:
        import matplotlib.pyplot as plt
        from matplotlib.figure import Figure
    except ImportError:
        # matplotlib is optional; without it there is nothing to patch.
        return

    # Local import keeps this block self-contained.
    import functools

    original_plt_savefig = plt.savefig
    original_figure_savefig = Figure.savefig

    self.original_functions['plt.savefig'] = original_plt_savefig
    self.original_functions['Figure.savefig'] = original_figure_savefig

    capture = self

    def record_output(fname):
        # NOTE(review): savefig also accepts file-like objects; confirm that
        # _should_track filters those out before _normalize_path sees them.
        if capture._should_track(fname):
            capture.outputs.add(capture._normalize_path(fname))

    # functools.wraps keeps __name__, __doc__ and __wrapped__ so help() and
    # signature introspection still describe the real matplotlib functions.
    @functools.wraps(original_plt_savefig)
    def tracked_plt_savefig(fname, *args, **kwargs):
        record_output(fname)
        return original_plt_savefig(fname, *args, **kwargs)

    # The figure instance is named ``fig`` so it does not shadow the
    # enclosing method's ``self``.
    @functools.wraps(original_figure_savefig)
    def tracked_figure_savefig(fig, fname, *args, **kwargs):
        record_output(fname)
        return original_figure_savefig(fig, fname, *args, **kwargs)

    plt.savefig = tracked_plt_savefig
    Figure.savefig = tracked_figure_savefig

def restore_all(self):
"""Restore all original functions."""
builtins.open = self.original_functions.get('builtins.open', builtins.open)

if 'pathlib.Path.open' in self.original_functions:
pathlib.Path.open = self.original_functions['pathlib.Path.open']
pathlib.Path.read_text = self.original_functions['pathlib.Path.read_text']
pathlib.Path.read_bytes = self.original_functions['pathlib.Path.read_bytes']
pathlib.Path.write_text = self.original_functions['pathlib.Path.write_text']
pathlib.Path.write_bytes = self.original_functions['pathlib.Path.write_bytes']

if 'pd.read_csv' in self.original_functions:
pd.read_csv = self.original_functions['pd.read_csv']
pd.read_excel = self.original_functions['pd.read_excel']
Expand All @@ -237,18 +269,28 @@ def restore_all(self):
pd.DataFrame.to_excel = self.original_functions['pd.DataFrame.to_excel']
pd.DataFrame.to_parquet = self.original_functions['pd.DataFrame.to_parquet']
pd.DataFrame.to_json = self.original_functions['pd.DataFrame.to_json']

if 'np.load' in self.original_functions:
np.load = self.original_functions['np.load']
np.save = self.original_functions['np.save']
np.loadtxt = self.original_functions['np.loadtxt']
np.savetxt = self.original_functions['np.savetxt']


if 'plt.savefig' in self.original_functions:
try:
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
plt.savefig = self.original_functions['plt.savefig']
Figure.savefig = self.original_functions['Figure.savefig']
except ImportError:
pass

def __enter__(self):
    """Install every I/O patch and return this capture object.

    Patches are applied in order: open, pathlib, pandas, numpy, matplotlib.
    If any step raises, the patches already installed are rolled back with
    ``restore_all`` before the exception propagates. This matters because
    the ``with`` statement does not call ``__exit__`` when ``__enter__``
    fails, so the global monkeypatches would otherwise stay in place.
    """
    try:
        self.patch_open()
        self.patch_pathlib()
        self.patch_pandas()
        self.patch_numpy()
        self.patch_matplotlib()
    except BaseException:
        # Undo the partial patching, then let the caller see the error.
        self.restore_all()
        raise
    return self

def __exit__(self, exc_type, exc_val, exc_tb):
Expand Down
Loading
Loading