Skip to content

Commit 0263b6b

Browse files
authored
Merge pull request #49 from fairscape/script-tracking
Script tracking
2 parents ae70ecf + b7f8b30 commit 0263b6b

11 files changed

Lines changed: 1159 additions & 2 deletions

File tree

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "fairscape-cli"
7-
version = "1.1.7"
7+
version = "1.1.9"
88
description = "A utility for packaging objects and validating metadata for FAIRSCAPE"
99
readme = "README.md"
1010
requires-python = ">=3.8"
@@ -39,7 +39,7 @@ dependencies = [
3939
"prettytable>=3.9.0",
4040
"jsonschema>=4.20.0",
4141
"sqids>=0.4.1",
42-
"fairscape-models>=1.0.8",
42+
"fairscape-models>=1.0.11",
4343
"pyyaml",
4444
"h5py",
4545
"frictionless>=5.0,<6.0",

src/fairscape_cli/__main__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from fairscape_cli.commands.publish_commands import publish_group
88
from fairscape_cli.commands.schema_commands import schema
99
from fairscape_cli.commands.augment_commands import augment_group
10+
from fairscape_cli.commands.track import track
1011

1112
@click.group(invoke_without_command=True)
1213
@click.pass_context
@@ -25,6 +26,7 @@ def cli(ctx):
2526
cli.add_command(publish_group, name='publish')
2627
cli.add_command(schema, name='schema')
2728
cli.add_command(augment_group, name='augment')
29+
cli.add_command(track, name='track')
2830

2931
if __name__ == "__main__":
3032
cli()
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import click
2+
import pathlib
3+
import os
4+
import runpy
5+
from typing import List
6+
7+
from fairscape_cli.tracking.io_capture import IOCapture
8+
from fairscape_cli.tracking.provenance_tracker import ProvenanceTracker
9+
from fairscape_cli.tracking.config import ProvenanceConfig, TrackerConfig
10+
from fairscape_cli.tracking.metadata_generator import create_metadata_generator
11+
12+
13+
@click.command('track')
@click.argument('script-path', type=click.Path(exists=True, path_type=pathlib.Path))
@click.option('--rocrate-path', type=click.Path(path_type=pathlib.Path), default=None, help='Path to RO-Crate directory (default: current directory)')
@click.option('--author', type=str, default="Unknown", help='Author name (default: from RO-Crate or "Unknown")')
@click.option('--keywords', multiple=True, default=["computation"], help='Keywords for metadata (default: from RO-Crate or ["computation"])')
@click.option('--input', 'manual_inputs', multiple=True, help='Manual input files to track')
@click.option('--no-llm', is_flag=True, default=False, help='Disable LLM-based description generation')
@click.option('--execution-name', type=str, default=None, help='Name for this execution (default: script filename)')
@click.pass_context
def track(
    ctx,
    script_path: pathlib.Path,
    rocrate_path: pathlib.Path,
    author: str,
    keywords: List[str],
    manual_inputs: List[str],
    no_llm: bool,
    execution_name: str
):
    """Track execution of a Python script and generate provenance metadata.

    Executes SCRIPT_PATH while capturing file I/O operations, then generates
    RO-Crate metadata documenting the computation, software, input datasets,
    and output datasets.

    Examples:

        fairscape-cli track analysis.py

        fairscape-cli track analysis.py --author "Jane Doe" --keywords ml --keywords analysis

        fairscape-cli track analysis.py --rocrate-path ./my-crate --input config.json

        fairscape-cli track analysis.py --no-llm --author "John Smith"
    """
    # Flow: read the script source, run it under IOCapture from its own
    # directory, then hand the captured I/O to ProvenanceTracker. On success
    # the computation GUID is written to stdout; every diagnostic goes to
    # stderr. Failures terminate the command with exit code 1.
    #
    # rocrate_path and execution_name arrive as None when the option is not
    # given; keywords and manual_inputs arrive as tuples (multiple=True), so
    # --keywords must be repeated once per keyword.

    rocrate_path = rocrate_path or pathlib.Path.cwd()

    if not script_path.exists():
        click.echo(f"ERROR: Script file not found: {script_path}", err=True)
        ctx.exit(code=1)

    try:
        with script_path.open('r') as f:
            code = f.read()
    except Exception as exc:
        click.echo(f"ERROR: Could not read script file: {exc}", err=True)
        ctx.exit(code=1)

    tracker_config = TrackerConfig()

    original_cwd = pathlib.Path.cwd()
    # Resolve BEFORE changing directory: a relative path such as
    # "scripts/analysis.py" would otherwise be looked up relative to the
    # script's own directory once os.chdir() has run and would not be found.
    script_file = script_path.resolve()
    script_dir = script_file.parent

    try:
        os.chdir(script_dir)

        with IOCapture(config=tracker_config) as capture:
            try:
                runpy.run_path(str(script_file), run_name='__main__')
            except SystemExit as e:
                # A bare sys.exit() carries code None, which means success.
                if e.code not in (None, 0):
                    click.echo(f"WARNING: Script exited with code {e.code}", err=True)
    except Exception as exc:
        click.echo(f"ERROR: Script execution failed: {exc}", err=True)
        ctx.exit(code=1)
    finally:
        # Always restore the caller's working directory, even on failure.
        os.chdir(original_cwd)

    if not capture.inputs and not capture.outputs and not manual_inputs:
        click.echo("WARNING: No file I/O detected in script execution", err=True)
        click.echo("No metadata generated.", err=True)
        return

    # Coerce to a real bool so the API key itself never ends up stored in the
    # provenance configuration.
    use_llm = (not no_llm) and bool(os.environ.get("GEMINI_API_KEY"))

    metadata_generator = None
    if use_llm:
        from datetime import datetime
        try:
            metadata_generator = create_metadata_generator(
                provider="gemini",
                timestamp=datetime.now().strftime("%Y%m%d_%H%M%S")
            )
        except Exception as exc:
            click.echo(f"WARNING: Could not initialize LLM metadata generator: {exc}", err=True)
            click.echo("Falling back to simple descriptions", err=True)
            # Keep the config consistent with the fallback announced above.
            use_llm = False

    provenance_config = ProvenanceConfig(
        rocrate_path=rocrate_path,
        author=author,
        keywords=list(keywords),
        manual_inputs=list(manual_inputs),
        use_llm=use_llm
    )

    try:
        tracker = ProvenanceTracker(
            config=provenance_config,
            metadata_generator=metadata_generator
        )

        exec_name = execution_name or script_path.stem

        result = tracker.track_execution(code, capture, execution_name=exec_name)

        click.echo(result.computation_guid)

        if ctx.obj and ctx.obj.get('verbose'):
            click.echo("\nTracking Summary:", err=True)
            click.echo(f" Software: {result.software_guid}", err=True)
            click.echo(f" Inputs: {result.input_count} datasets ({result.reused_count} reused)", err=True)
            click.echo(f" Outputs: {result.output_count} datasets", err=True)

    except (ValueError, RuntimeError) as exc:
        # These are reported with their own message, without a prefix.
        click.echo(f"ERROR: {exc}", err=True)
        ctx.exit(code=1)
    except Exception as exc:
        click.echo(f"ERROR: Tracking failed: {exc}", err=True)
        ctx.exit(code=1)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .magic import fairscape
2+
3+
__all__ = ['fairscape']

src/fairscape_cli/jupyter/magic.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
import os
2+
import pathlib
3+
import argparse
4+
from IPython.core.magic import register_cell_magic
5+
from IPython import get_ipython
6+
7+
from fairscape_cli.tracking.io_capture import IOCapture
8+
from fairscape_cli.tracking.provenance_tracker import ProvenanceTracker
9+
from fairscape_cli.tracking.config import ProvenanceConfig, TrackerConfig
10+
from fairscape_cli.tracking.metadata_generator import create_metadata_generator
11+
12+
13+
def parse_magic_arguments(line: str) -> argparse.Namespace:
    """Parse the argument line of the ``%%fairscape`` cell magic.

    The line is tokenized with shell-style quoting rules, so a value that
    contains spaces can be quoted (``--author "Jane Doe"``).

    Args:
        line: Text following ``%%fairscape`` on the magic's first line.

    Returns:
        Namespace with ``command``, ``rocrate_path``, ``author``,
        ``keywords``, ``manual_inputs`` and ``no_llm``.

    Raises:
        SystemExit: If the line cannot be tokenized or argparse rejects it;
            a usage hint is printed first.
    """
    import shlex

    usage = "Usage: %%fairscape track [--rocrate-path PATH] [--author AUTHOR] [--keywords KW1 KW2] [--input FILE1 FILE2] [--no-llm]"

    parser = argparse.ArgumentParser(description='Track Jupyter cell execution')
    parser.add_argument('command', nargs='?', default=None)
    parser.add_argument('--rocrate-path', type=str, default=None)
    parser.add_argument('--author', type=str, default="Unknown")
    parser.add_argument('--keywords', nargs='+', default=["jupyter", "computation"])
    parser.add_argument('--input', nargs='+', default=[], dest='manual_inputs')
    parser.add_argument('--no-llm', action='store_true', help='Disable LLM descriptions')

    # POSIX-mode splitting treats backslashes as escapes, which would mangle
    # Windows paths; there, split without escape handling and strip the
    # surrounding quotes by hand instead.
    posix = os.name != "nt"
    try:
        args_list = shlex.split(line, posix=posix)
    except ValueError:
        # Unbalanced quotes: report it the same way as any other bad usage.
        print(usage)
        raise SystemExit(2) from None

    if not posix:
        args_list = [
            tok[1:-1] if len(tok) >= 2 and tok[0] == tok[-1] and tok[0] in "\"'" else tok
            for tok in args_list
        ]

    try:
        args = parser.parse_args(args_list)
    except SystemExit:
        print(usage)
        raise

    return args
32+
33+
34+
def execute_cell_safely(cell: str) -> bool:
    """Run ``cell`` in the active IPython shell and report whether it succeeded.

    Args:
        cell: Source code of the cell body to execute.

    Returns:
        True if the cell ran without error, False otherwise (an error line is
        printed in that case).
    """
    ip = get_ipython()
    result = ip.run_cell(cell)

    # ``success`` is False for errors raised while the cell runs AND for
    # errors raised before execution starts (e.g. a syntax error); checking
    # ``error_in_exec`` alone would report the latter as a successful run.
    if not result.success:
        print("ERROR: Cell execution failed")
        return False

    return True
44+
45+
46+
@register_cell_magic
def fairscape(line, cell):
    """
    Jupyter cell magic for tracking computational provenance.

    Usage:
        %%fairscape track [options]
        <your code here>

    Options:
        --rocrate-path PATH   Path to RO-Crate directory (default: current directory)
        --author AUTHOR       Author name (default: from RO-Crate or "Unknown")
        --keywords KW1 KW2    Keywords for metadata (default: from RO-Crate or ["jupyter", "computation"])
        --input FILE1 FILE2   Manual input files to track
        --no-llm              Disable LLM-based description generation
    """
    args = parse_magic_arguments(line)

    if args.command != 'track':
        print("Usage: %%fairscape track [options]")
        return

    rocrate_path = pathlib.Path(args.rocrate_path) if args.rocrate_path else pathlib.Path.cwd()

    tracker_config = TrackerConfig()

    # Run the cell body while file I/O is being captured; a failed cell
    # produces no provenance metadata.
    with IOCapture(config=tracker_config) as capture:
        if not execute_cell_safely(cell):
            return

    # Coerce to a real bool so the API key itself never ends up stored in the
    # provenance configuration.
    use_llm = (not args.no_llm) and bool(os.environ.get("GEMINI_API_KEY"))

    metadata_generator = None
    if use_llm:
        from datetime import datetime
        try:
            metadata_generator = create_metadata_generator(
                provider="gemini",
                timestamp=datetime.now().strftime("%Y%m%d_%H%M%S")
            )
        except Exception as exc:
            # LLM descriptions are optional: a generator that cannot be
            # built should not discard the cell run that was just captured.
            print(f"WARNING: Could not initialize LLM metadata generator: {exc}")
            print("Falling back to simple descriptions")
            use_llm = False

    provenance_config = ProvenanceConfig(
        rocrate_path=rocrate_path,
        author=args.author,
        keywords=args.keywords,
        manual_inputs=args.manual_inputs,
        use_llm=use_llm
    )

    try:
        tracker = ProvenanceTracker(
            config=provenance_config,
            metadata_generator=metadata_generator
        )

        # The returned tracking result is not used by the magic.
        tracker.track_execution(cell, capture)

    except Exception as e:
        print(f"ERROR: Tracking failed: {e}")
        raise
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from .io_capture import IOCapture
2+
from .provenance_tracker import ProvenanceTracker
3+
from .metadata_generator import (
4+
MetadataGenerator,
5+
GeminiMetadataGenerator,
6+
FallbackMetadataGenerator,
7+
MockMetadataGenerator,
8+
create_metadata_generator
9+
)
10+
from .config import TrackerConfig, ProvenanceConfig, TrackingResult
11+
from .utils import (
12+
normalize_path,
13+
is_trackable_path,
14+
read_dataset_sample,
15+
collect_dataset_samples,
16+
format_samples_for_prompt
17+
)
18+
19+
__all__ = [
20+
'IOCapture',
21+
'ProvenanceTracker',
22+
'MetadataGenerator',
23+
'GeminiMetadataGenerator',
24+
'FallbackMetadataGenerator',
25+
'MockMetadataGenerator',
26+
'create_metadata_generator',
27+
'TrackerConfig',
28+
'ProvenanceConfig',
29+
'TrackingResult',
30+
'normalize_path',
31+
'is_trackable_path',
32+
'read_dataset_sample',
33+
'collect_dataset_samples',
34+
'format_samples_for_prompt',
35+
]
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from dataclasses import dataclass, field
2+
from typing import List, Optional
3+
from pathlib import Path
4+
5+
6+
@dataclass
class TrackerConfig:
    """Settings object passed to the I/O capture layer.

    The four ``track_*`` flags all default to True. Judging by their names
    they enable capture for builtins, pathlib, pandas and numpy respectively;
    how each flag is consumed is not visible here (TODO confirm).
    ``excluded_patterns`` holds path fragments, presumably used to filter
    paths out of tracking (verify against the consumer).
    """

    track_builtins: bool = True
    track_pathlib: bool = True
    track_pandas: bool = True
    track_numpy: bool = True
    # A fresh list is built per instance so configs never share state.
    excluded_patterns: List[str] = field(
        default_factory=lambda: list(
            (
                ".matplotlib",
                ".ipython",
                ".jupyter",
                "site-packages",
                "/tmp/",
                "__pycache__",
            )
        )
    )
20+
21+
22+
@dataclass
class TrackingResult:
    """Identifiers and counters describing one tracked execution.

    ``str()`` renders a four-line summary; ``new_datasets`` is carried on the
    object but is not part of that summary.
    """

    computation_guid: str
    software_guid: str
    input_count: int
    output_count: int
    reused_count: int
    new_datasets: int

    def __str__(self):
        """Return the multi-line, human-readable summary."""
        summary_lines = [
            f"Tracked computation: {self.computation_guid}",
            f" Software: {self.software_guid}",
            f" Inputs: {self.input_count} datasets ({self.reused_count} reused)",
            f" Outputs: {self.output_count} datasets",
        ]
        return "\n".join(summary_lines)
38+
39+
40+
@dataclass
class ProvenanceConfig:
    """Options handed to the provenance tracker for one tracked run.

    Only ``rocrate_path`` is required; every other field has a default.
    """

    # Location of the RO-Crate; callers in this package pass a directory path.
    rocrate_path: Path
    author: str = "Unknown"
    # Mutable defaults are built per instance so configs never share lists.
    keywords: List[str] = field(
        default_factory=lambda: list(("jupyter", "computation"))
    )
    manual_inputs: List[str] = field(default_factory=lambda: [])
    use_llm: bool = False

0 commit comments

Comments
 (0)