linkml · amc-corey-cox · Apr 6, 2026 · Apr 6, 2026 · Apr 6, 2026 · Apr 6, 2026
diff --git a/README.md b/README.md
@@ -24,7 +24,7 @@ The main effort of this project is to use existing [LinkML](https://linkml.io/li
    - Within the `dm_bip` directory, there are core python files:
      - `main.py` - Core entry point
      - `cli.py` for [`click`](https://click.palletsprojects.com) commands.
-   - Plus subdirectories for pipeline functionality: `cleaners/`, `format_converter/`, `map_data/`
+   - Plus subdirectories for pipeline functionality: `cleaners/`, `format_converter/`, `map_data/`, `trans_spec_gen/`
  - `tests` directory with a very basic test.
  - `uv` compatible `pyproject.toml` file containing minimal package requirements.
  - `tox.ini` file containing configuration for:

diff --git a/src/dm_bip/cli.py b/src/dm_bip/cli.py
@@ -1,6 +1,7 @@
 """Command line interface for dm-bip."""
 
 import logging
+from pathlib import Path
 from typing import Annotated, Optional
 
 import typer
@@ -50,5 +51,32 @@ def run():
     typer.echo("Run 'make help' to see available targets and usage information.")
 
 
+@app.command()
+def generate_trans_specs(
+    input_csv: Annotated[Path, typer.Option("--input", "-i", help="Path to the metadata CSV")],
+    output_dir: Annotated[Path, typer.Option("--output", "-o", help="Directory for YAML output files")],
+    cohort: Annotated[str, typer.Option("--cohort", "-c", help="Cohort to filter on (e.g. aric, jhs, whi)")],
+    entity: Annotated[str, typer.Option("--entity", "-e", help="Entity type to filter on")] = "MeasurementObservation",
+    template: Annotated[str, typer.Option("--template", "-t", help="Jinja2 template filename")] = "yaml_measobs.j2",
+):
+    """Generate trans-spec YAML files from a metadata CSV."""
+    from dm_bip.trans_spec_gen.generate_trans_specs import generate_yaml  # loaded only when this command runs
+
+    written = generate_yaml(
+        input_csv=input_csv,
+        output_dir=output_dir,
+        entity=entity,
+        cohort=cohort,
+        template_name=template,
+    )
+    if not written:
+        typer.echo(f"No matching rows for entity={entity}, cohort={cohort}")
+        raise typer.Exit(code=1)  # non-zero exit status when nothing was generated
+
+    typer.echo(f"Generated {len(written)} YAML files in {output_dir}")
+    for yaml_path in written:
+        typer.echo(f"  {yaml_path}")
+
+
 if __name__ == "__main__":
     app()
diff --git a/src/dm_bip/make_yaml/README.md b/src/dm_bip/make_yaml/README.md
diff --git a/src/dm_bip/trans_spec_gen/README.md b/src/dm_bip/trans_spec_gen/README.md
@@ -0,0 +1,98 @@
+# Trans-Spec Generator
+
+Generate LinkML-Map transformation specification YAML files from a metadata CSV.
+
+## Background
+
+Transformation specifications (trans-specs) define how source data maps to the
+[BDCHM](https://github.com/biomedical-data-models/bdchm) harmonized model. Each
+trans-spec is a YAML file that tells [linkml-map](https://github.com/linkml/linkml-map)
+how to derive harmonized slots (participant, visit, age, observation type, quantity)
+from dbGaP study tables.
+
+This tool automates trans-spec generation from a metadata CSV that catalogs the
+mapping between source variables (phv IDs) and harmonized variables. It was
+originally developed as a Stata/Python workflow in
+[RTIInternational/NHLBI-BDC-DMC-HV](https://github.com/RTIInternational/NHLBI-BDC-DMC-HV)
+and has been refactored for use within dm-bip.
+
+## Usage
+
+```bash
+uv run dm-bip generate-trans-specs \
+  --input shortdata.csv \
+  --output ./output \
+  --cohort aric
+```
+
+### Options
+
+| Option | Short | Required | Default | Description |
+|--------|-------|----------|---------|-------------|
+| `--input` | `-i` | Yes | | Path to the metadata CSV |
+| `--output` | `-o` | Yes | | Directory for YAML output files |
+| `--cohort` | `-c` | Yes | | Cohort to generate specs for (e.g. aric, jhs, whi, cardia, fhs) |
+| `--entity` | `-e` | No | MeasurementObservation | Entity type to filter on |
+| `--template` | `-t` | No | yaml_measobs.j2 | Jinja2 template filename |
+
+### Output structure
+
+```
+output/
+└── {cohort}/
+    ├── good/          # Rows where row_good == 1 (ready for use)
+    │   ├── albumin_bld.yaml
+    │   ├── bdy_hgt.yaml
+    │   └── ...
+    └── bad/           # Rows where row_good != 1 (need curator review)
+        └── ...
+```
+
+## Input CSV format
+
+The metadata CSV must contain these columns:
+
+| Column | Description |
+|--------|-------------|
+| `bdchm_entity` | Entity type (e.g. "MeasurementObservation") |
+| `cohort` | Study cohort (e.g. "aric", "jhs") |
+| `bdchm_varname` | Harmonized variable name |
+| `row_good` | 1 = ready for use; any other value (e.g. 0) = needs review |
+| `pht` | dbGaP study table ID |
+| `participantidphv` | PHV for participant ID |
+| `phv` | PHV for the measurement value |
+| `onto_id` | Ontology ID (e.g. LOINC code) |
+| `bdchm_unit` | Harmonized unit written as the `unit` value in the output |
+| `has_visit` | 1 = has direct visit value |
+| `has_visit_expr` | 1 = visit derived via expression (checked only when has_visit is not 1) |
+| `associatedvisit` | Visit label (when has_visit=1) |
+| `associatedvisit_expr` | Visit expression input (when has_visit_expr=1) |
+| `has_age` | 1 = has age data |
+| `ageinyearsphv` | PHV for age in years |
+| `unit_match` | 1 = source unit matches target |
+| `unit_convert` | 1 = needs unit conversion |
+| `unit_expr` | 1 = needs expression-based conversion |
+| `unit_casestmt` | 1 = needs case-statement conversion |
+| `source_unit` | Source unit (when unit_convert=1) |
+| `target_unit` | Unit to convert to (when unit_convert=1) |
+| `conversion_rule` | Expression appended after the source value, e.g. `* 0.453592` (when unit_expr=1) |
+| `unit_casestmt_custom` | Case expression (when unit_casestmt=1) |
+
+Exactly one of `unit_match`, `unit_convert`, `unit_expr`, `unit_casestmt` should
+be 1 for each row; the template checks them in that order and uses the first that is 1.
+
+## Jinja2 template
+
+The template (`templates/yaml_measobs.j2`) defines the YAML structure for
+MeasurementObservation trans-specs. It covers four unit-handling branches
+and conditional visit/age fields. The `--template` option selects another
+template by filename; the CLI looks it up in the same `templates/` directory.
+
+## Testing
+
+```bash
+uv run pytest tests/unit/test_generate_trans_specs.py -v
+```
+
+Tests use a synthetic 8-row CSV (`tests/input/make_yaml/shortdata_sample.csv`)
+that covers all template branches without requiring real study data.
diff --git a/src/dm_bip/trans_spec_gen/generate_trans_specs.py b/src/dm_bip/trans_spec_gen/generate_trans_specs.py
@@ -0,0 +1,64 @@
+"""
+Generate YAML transformation specs from metadata CSV using Jinja2 templates.
+
+Refactored from DMCYAML_07_GenerateYAML_forPy.py (RTIInternational/NHLBI-BDC-DMC-HV).
+"""
+
+from pathlib import Path
+
+import pandas as pd
+from jinja2 import Environment, FileSystemLoader
+
+TEMPLATES_DIR = Path(__file__).parent / "templates"
+
+
+def generate_yaml(
+    input_csv: Path,
+    output_dir: Path,
+    entity: str,
+    cohort: str,
+    template_name: str = "yaml_measobs.j2",
+    templates_dir: Path = TEMPLATES_DIR,
+) -> list[Path]:
+    """
+    Generate YAML files from a metadata CSV for a given entity and cohort.
+
+    Rows are split on ``row_good`` (1 -> ``good``, anything else -> ``bad``) and grouped by ``bdchm_varname``.
+    Each group goes to ``output_dir/<cohort>/<good|bad>/<bdchm_varname>.yaml``, rendered once per row.
+
+    Args:
+        input_csv: Path to the metadata CSV.
+        output_dir: Directory to write YAML output files.
+        entity: Entity type to filter on (e.g. "MeasurementObservation").
+        cohort: Cohort to filter on (e.g. "aric").
+        template_name: Jinja2 template filename.
+        templates_dir: Directory containing Jinja2 templates.
+
+    Returns:
+        List of paths to generated YAML files; empty when no row matches the entity and cohort.
+
+    """
+    df = pd.read_csv(input_csv).fillna(0)  # blank (NaN) cells become 0 in every column
+
+    df_filtered = df[(df["bdchm_entity"] == entity) & (df["cohort"] == cohort)]
+    if df_filtered.empty:
+        return []
+
+    env = Environment(loader=FileSystemLoader(str(templates_dir)), trim_blocks=True, lstrip_blocks=True)  # noqa: S701 - generating YAML, not HTML
+    template = env.get_template(template_name)
+
+    is_good = df_filtered["row_good"] == 1
+    written = []
+    for quality, subset in (("good", df_filtered[is_good]), ("bad", df_filtered[~is_good])):
+        for varname, group in subset.groupby("bdchm_varname"):
+            # str(): a numeric name, or a blank one filled with 0 above, is not a str and would make Path() raise.
+            # .name: keep only the final path component so the file lands inside the quality folder.
+            safe_name = Path(str(varname)).name
+            out_path = output_dir / cohort / quality / f"{safe_name}.yaml"
+            out_path.parent.mkdir(parents=True, exist_ok=True)
+            # Explicit UTF-8 so the output does not depend on the platform's default encoding.
+            with open(out_path, "w", encoding="utf-8") as f:
+                f.writelines(template.render(**row.to_dict()) for _, row in group.iterrows())
+            written.append(out_path)
+
+    return written
diff --git a/src/dm_bip/trans_spec_gen/templates/yaml_measobs.j2 b/src/dm_bip/trans_spec_gen/templates/yaml_measobs.j2
@@ -0,0 +1,49 @@
+- class_derivations:
+    MeasurementObservation:
+      populated_from: {{pht}}
+      slot_derivations:
+        associated_participant:
+          populated_from: {{participantidphv}}
+        {% if has_visit == 1 %}{# fixed visit label taken from associatedvisit #}
+        associated_visit:
+          value: {{associatedvisit}}
+        {% elif has_visit_expr == 1 %}{# visit id built with uuid5 from the participant id and associatedvisit_expr #}
+        associated_visit:
+          expr: 'uuid5("https://w3id.org/bdchm/Visit", str({{ '{' }}{{participantidphv}}{{ '}' }}) + ":" + {{associatedvisit_expr}})'
+        {% endif %}
+        {% if has_age == 1 %}{# expression multiplies the ageinyearsphv source value by 365 #}
+        age_at_observation:
+          expr: '{{ '{' }}{{ageinyearsphv}}{{ '}' }} * 365'
+        {% endif %}
+        observation_type:
+          value: {{onto_id}}
+        value_quantity:
+          object_derivations:
+          - class_derivations:
+              Quantity:
+                populated_from: {{pht}}
+                slot_derivations:
+                  value_decimal:
+                  {% if unit_match == 1 %}{# source value copied as-is #}
+                    populated_from: {{phv}}
+                  unit:
+                    value: '{{bdchm_unit}}'
+                  {% elif unit_convert == 1 %}{# source value converted from source_unit to target_unit #}
+                    populated_from: {{phv}}
+                    unit_conversion:
+                      source_unit: '{{source_unit}}'
+                      target_unit: '{{target_unit}}'
+                  unit:
+                    value: '{{bdchm_unit}}'
+                    range: string
+                  {% elif unit_expr == 1 %}{# conversion_rule is appended after the source value #}
+                    expr: '{{ '{' }}{{phv}}{{ '}' }} {{conversion_rule}}'
+                  unit:
+                    value: '{{bdchm_unit}}'
+                    range: string
+                  {% elif unit_casestmt == 1 %}{# unit_casestmt_custom supplies the whole expression #}
+                    expr: '{{unit_casestmt_custom}}'
+                  unit:
+                    value: '{{bdchm_unit}}'
+                    range: string
+                  {% endif %}
diff --git a/tests/input/make_yaml/shortdata_sample.csv b/tests/input/make_yaml/shortdata_sample.csv
@@ -0,0 +1,9 @@
+bdchm_entity,cohort,bdchm_varname,row_good,pht,participantidphv,associatedvisit,has_visit,has_visit_expr,associatedvisit_expr,ageinyearsphv,has_age,onto_id,phv,bdchm_unit,unit_match,unit_convert,unit_expr,unit_casestmt,source_unit,target_unit,conversion_rule,unit_casestmt_custom
+MeasurementObservation,aric,albumin_bld,1,pht000001,phv00000101,Visit_1,1,0,,phv00000102,1,LOINC:1751-7,phv00000103,g/dL,1,0,0,0,,,,
+MeasurementObservation,aric,bdy_hgt,1,pht000002,phv00000201,Visit_1,1,0,,phv00000202,1,LOINC:8302-2,phv00000203,cm,0,1,0,0,in,cm,,
+MeasurementObservation,aric,bdy_wgt,1,pht000003,phv00000301,Visit_1,1,0,,phv00000302,1,LOINC:29463-7,phv00000303,kg,0,0,1,0,,,* 0.453592,
+MeasurementObservation,aric,waist_circ,1,pht000006,phv00000601,Visit_1,1,0,,phv00000602,1,LOINC:56086-2,phv00000603,cm,0,0,0,1,,,,case({phv00000603} == 1; {phv00000603} * 2.54)
+MeasurementObservation,aric,bp_systolic,1,pht000007,phv00000701,,0,1,Visit_1_label,phv00000702,1,LOINC:8480-6,phv00000703,mmHg,1,0,0,0,,,,
+MeasurementObservation,aric,glucose_bld,1,pht000008,phv00000801,Visit_1,1,0,,,0,LOINC:2345-7,phv00000803,mg/dL,1,0,0,0,,,,
+MeasurementObservation,aric,hemo_a1c,0,pht000004,phv00000401,Visit_1,1,0,,phv00000402,1,LOINC:4548-4,phv00000403,%,1,0,0,0,,,,
+MeasurementObservation,jhs,bp_systolic,1,pht000005,phv00000501,Visit_1,1,0,,phv00000502,1,LOINC:8480-6,phv00000503,mmHg,1,0,0,0,,,,