Skip to content

feat: implement mito #1105

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
May 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 0 additions & 14 deletions v03_pipeline/lib/model/dataset_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,20 +101,6 @@ def entries_fields(
},
}[self]

@property
def entries_export_fields(
self,
) -> Callable[[hl.StructExpression], hl.StructExpression]:
return {
DatasetType.SNV_INDEL: lambda fe: hl.Struct(
sampleId=fe.s,
gt=fe.GT.n_alt_alleles(),
gq=fe.GQ,
ab=fe.AB,
dp=fe.DP,
),
}[self]

@property
def row_fields(
self,
Expand Down
202 changes: 200 additions & 2 deletions v03_pipeline/lib/tasks/exports/fields.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,60 @@
import hail as hl

from v03_pipeline.lib.annotations.expression_helpers import get_expr_for_xpos
from v03_pipeline.lib.model import DatasetType, SampleType
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
from v03_pipeline.lib.tasks.exports.misc import array_structexpression_fields


def reference_independent_contig(locus: hl.LocusExpression) -> hl.StringExpression:
    """Return the contig name stripped of build-specific conventions.

    Removes a leading 'chr' prefix (GRCh38-style contigs) and rewrites the
    GRCh37 mitochondrial contig 'MT' as 'M'.
    """
    # hail's StringExpression.replace is regex-based, so '^chr' anchors the
    # match to the start of the contig name.
    without_prefix = locus.contig.replace('^chr', '')
    return without_prefix.replace('MT', 'M')


def get_dataset_type_specific_annotations(
    ht: hl.Table,
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
):
    """Return export annotations that only exist for the given dataset type."""
    if dataset_type == DatasetType.SNV_INDEL:
        annotations = {
            # hgmd is absent on some tables; export a typed missing struct so
            # the output schema is stable either way.
            'hgmd': (
                ht.hgmd
                if hasattr(ht, 'hgmd')
                else hl.missing(hl.tstruct(accession=hl.tstr, classification=hl.tstr))
            ),
        }
        # SCREEN region types are only annotated on GRCh38.
        if reference_genome == ReferenceGenome.GRCh38:
            annotations['screenRegionType'] = ht.screen.region_types.first()
        return annotations
    if dataset_type == DatasetType.MITO:
        return {
            'commonLowHeteroplasmy': ht.common_low_heteroplasmy,
            'mitomapPathogenic': ht.mitomap.pathogenic,
        }
    # Matches the KeyError raised by the original dict dispatch for
    # unsupported dataset types.
    raise KeyError(dataset_type)


def get_calls_export_fields(
    fe: hl.Struct,
    dataset_type: DatasetType,
):
    """Shape a single family-entry struct into the exported ``calls`` element."""
    if dataset_type == DatasetType.SNV_INDEL:
        return hl.Struct(
            sampleId=fe.s,
            gt=fe.GT.n_alt_alleles(),
            gq=fe.GQ,
            ab=fe.AB,
            dp=fe.DP,
        )
    if dataset_type == DatasetType.MITO:
        # MITO callsets carry heteroplasmy level (HL), mito copy number, and
        # contamination in place of GQ/AB.
        return hl.Struct(
            sampleId=fe.s,
            gt=fe.GT.n_alt_alleles(),
            dp=fe.DP,
            hl=fe.HL,
            mitoCn=fe.mito_cn,
            contamination=fe.contamination,
        )
    # Matches the KeyError raised by the original dict dispatch for
    # unsupported dataset types.
    raise KeyError(dataset_type)


def get_entries_export_fields(
Expand All @@ -25,7 +78,152 @@ def get_entries_export_fields(
),
'filters': ht.filters,
'calls': ht.family_entries.map(
lambda fe: dataset_type.entries_export_fields(fe),
lambda fe: get_calls_export_fields(fe, dataset_type),
),
'sign': 1,
}


def get_predictions_export_fields(
    ht: hl.Table,
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
):
    """Return the dataset-type-specific ``predictions`` struct fields for export.

    Fix: scraped review-comment text ("Copy link / Collaborator Author / ...")
    had been pasted into the middle of the MITO branch, breaking the syntax of
    the function; it is removed here and the clean dict restored.
    """
    return {
        DatasetType.SNV_INDEL: lambda ht: {
            'cadd': ht.dbnsfp.CADD_phred,
            'eigen': ht.eigen.Eigen_phred,
            'fathmm': ht.dbnsfp.fathmm_MKL_coding_score,
            **(
                {
                    # Non-coding constraint z-scores only exist on GRCh38.
                    'gnomad_noncoding': ht.gnomad_non_coding_constraint.z_score,
                }
                if reference_genome == ReferenceGenome.GRCh38
                else {}
            ),
            'mpc': ht.dbnsfp.MPC_score,
            'mut_pred': ht.dbnsfp.MutPred_score,
            'mut_tester': ht.dbnsfp.MutationTaster_pred,
            'polyphen': ht.dbnsfp.Polyphen2_HVAR_score,
            'primate_ai': ht.dbnsfp.PrimateAI_score,
            'revel': ht.dbnsfp.REVEL_score,
            'sift': ht.dbnsfp.SIFT_score,
            'splice_ai': ht.splice_ai.delta_score,
            'splice_ai_consequence': ht.splice_ai.splice_consequence,
            'vest': ht.dbnsfp.VEST4_score,
        },
        DatasetType.MITO: lambda ht: {
            'apogee': ht.mitimpact.score,
            # hl.or_missing logic ported from hail-search: export 'Y' only
            # when the variant is haplogroup-defining, otherwise missing.
            'haplogroup_defining': hl.or_missing(ht.haplogroup.is_defining, 'Y'),
            'hmtvar': ht.hmtvar.score,
            'mitotip': ht.mitotip.trna_prediction,
            'mut_taster': ht.dbnsfp.MutationTaster_pred,
            'sift': ht.dbnsfp.SIFT_score,
            'mlc': ht.local_constraint_mito.score,
        },
    }[dataset_type](ht)


def get_populations_export_fields(ht: hl.Table, dataset_type: DatasetType):
    """Return the dataset-type-specific ``populations`` struct fields for export.

    Fix: ``gnomad_mito_heteroplasmy.af`` previously exported ``AF_hom`` — a
    copy-paste slip from the homoplasmic ``gnomad_mito`` struct above it.  The
    heteroplasmy struct already uses ``AC_het`` for ``ac``, and the parallel
    ``helix_heteroplasmy`` struct uses ``AF_het``, so ``AF_het`` is exported
    here.
    """
    return {
        DatasetType.SNV_INDEL: lambda ht: {
            'exac': hl.Struct(
                ac=ht.exac.AC_Adj,
                af=ht.exac.AF,
                an=ht.exac.AN_Adj,
                filter_af=ht.exac.AF_POPMAX,
                hemi=ht.exac.AC_Hemi,
                het=ht.exac.AC_Het,
                hom=ht.exac.AC_Hom,
            ),
            'gnomad_exomes': hl.Struct(
                ac=ht.gnomad_exomes.AC,
                af=ht.gnomad_exomes.AF,
                an=ht.gnomad_exomes.AN,
                filter_af=ht.gnomad_exomes.AF_POPMAX_OR_GLOBAL,
                hemi=ht.gnomad_exomes.Hemi,
                hom=ht.gnomad_exomes.Hom,
            ),
            'gnomad_genomes': hl.Struct(
                ac=ht.gnomad_genomes.AC,
                af=ht.gnomad_genomes.AF,
                an=ht.gnomad_genomes.AN,
                filter_af=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL,
                hemi=ht.gnomad_genomes.Hemi,
                hom=ht.gnomad_genomes.Hom,
            ),
            'topmed': hl.Struct(
                ac=ht.topmed.AC,
                af=ht.topmed.AF,
                an=ht.topmed.AN,
                het=ht.topmed.Het,
                hom=ht.topmed.Hom,
            ),
        },
        DatasetType.MITO: lambda ht: {
            # Homoplasmic frequencies.
            'gnomad_mito': hl.Struct(
                ac=ht.gnomad_mito.AC_hom,
                af=ht.gnomad_mito.AF_hom,
                an=ht.gnomad_mito.AN,
            ),
            # Heteroplasmic frequencies (see docstring for the AF_het fix).
            'gnomad_mito_heteroplasmy': hl.Struct(
                ac=ht.gnomad_mito.AC_het,
                af=ht.gnomad_mito.AF_het,
                an=ht.gnomad_mito.AN,
                max_hl=ht.gnomad_mito.max_hl,
            ),
            'helix': hl.Struct(
                ac=ht.helix_mito.AC_hom,
                af=ht.helix_mito.AF_hom,
                an=ht.helix_mito.AN,
            ),
            'helix_heteroplasmy': hl.Struct(
                ac=ht.helix_mito.AC_het,
                af=ht.helix_mito.AF_het,
                an=ht.helix_mito.AN,
                max_hl=ht.helix_mito.max_hl,
            ),
        },
    }[dataset_type](ht)


def get_variants_export_fields(
    ht: hl.Table,
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
):
    """Assemble the full per-variant export field mapping.

    Combines the shared positional/identity fields with dataset-type-specific
    annotations, predictions, populations, and any array-of-struct fields
    present on the table.
    """
    fields = {
        'key_': ht.key_,
        'xpos': ht.xpos,
        'chrom': reference_independent_contig(ht.locus),
        'pos': ht.locus.position,
        'ref': ht.alleles[0],
        'alt': ht.alleles[1],
        'variantId': ht.variant_id,
        'rsid': ht.rsid,
    }
    # CAID is only present on some annotation tables.
    if hasattr(ht, 'CAID'):
        fields['CAID'] = ht.CAID
    # Tables carry a liftover locus for the *other* build: rg37_locus on
    # GRCh38 tables, rg38_locus on GRCh37 tables.
    lifted_over_locus = ht.rg37_locus if hasattr(ht, 'rg37_locus') else ht.rg38_locus
    fields['liftedOverChrom'] = reference_independent_contig(lifted_over_locus)
    fields['liftedOverPos'] = lifted_over_locus.position
    fields.update(
        get_dataset_type_specific_annotations(ht, reference_genome, dataset_type),
    )
    fields['predictions'] = hl.Struct(
        **get_predictions_export_fields(ht, reference_genome, dataset_type),
    )
    fields['populations'] = hl.Struct(
        **get_populations_export_fields(ht, dataset_type),
    )
    # Append remaining array-of-struct fields in deterministic (sorted) order.
    for field_name in sorted(array_structexpression_fields(ht)):
        fields[field_name] = ht[field_name]
    return fields
5 changes: 3 additions & 2 deletions v03_pipeline/lib/tasks/exports/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def subset_filterable_transcripts_fields(
def camelcase_array_structexpression_fields(
ht: hl.Table,
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
):
for field in array_structexpression_fields(ht):
ht = ht.transmute(
Expand All @@ -94,12 +95,12 @@ def camelcase_array_structexpression_fields(
},
)

# Custom handling of nested sorted_transcript_consequences fields for GRCh38
# Custom handling of nested sorted_transcript_consequences fields for GRCh38/SNV_INDEL.
# Note that spliceregion (extended_intronic_splice_region_variant) prevents
# a more procedural approach here.
if (
reference_genome == ReferenceGenome.GRCh38
and 'sortedTranscriptConsequences' in ht.row
and dataset_type == DatasetType.SNV_INDEL
):
ht = ht.annotate(
sortedTranscriptConsequences=ht.sortedTranscriptConsequences.map(
Expand Down
6 changes: 5 additions & 1 deletion v03_pipeline/lib/tasks/exports/misc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,11 @@ def test_camelcase_array_structexpression_fields(self) -> None:
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
)
ht = camelcase_array_structexpression_fields(ht, ReferenceGenome.GRCh38)
ht = camelcase_array_structexpression_fields(
ht,
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
)
ht = ht.annotate(
sortedTranscriptConsequences=[ht.sortedTranscriptConsequences[0]],
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from unittest import mock
from unittest.mock import Mock

import hail as hl
import luigi.worker
import pandas as pd
Expand All @@ -15,11 +18,13 @@
WriteNewClinvarVariantsParquetTask,
)
from v03_pipeline.lib.test.misc import convert_ndarray_to_list
from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase

TEST_SNV_INDEL_ANNOTATIONS = (
'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht'
)
TEST_MITO_ANNOTATIONS = 'v03_pipeline/var/test/exports/GRCh38/MITO/annotations.ht'

TEST_RUN_ID = 'manual__2024-04-03'

Expand All @@ -37,6 +42,16 @@ def setUp(self) -> None:
TEST_RUN_ID,
),
)
ht = hl.read_table(
TEST_MITO_ANNOTATIONS,
)
ht.write(
new_variants_table_path(
ReferenceGenome.GRCh38,
DatasetType.MITO,
TEST_RUN_ID,
),
)

def test_write_new_clinvar_variants_parquet_test(
self,
Expand Down Expand Up @@ -81,3 +96,54 @@ def test_write_new_clinvar_variants_parquet_test(
},
],
)

@mock.patch(
    'v03_pipeline.lib.tasks.exports.write_new_clinvar_variants_parquet.WriteNewVariantsTableTask',
)
def test_write_new_mito_clinvar_variants_parquet_test(
    self,
    write_new_variants_table_task: Mock,
) -> None:
    """Run the MITO clinvar-variants parquet export task end-to-end and
    assert on the first exported record."""
    # Mock the upstream variants-table dependency as already complete so the
    # task reads the MITO annotations table written in setUp instead of
    # rebuilding it.
    write_new_variants_table_task.return_value = MockCompleteTask()
    worker = luigi.worker.Worker()
    task = WriteNewClinvarVariantsParquetTask(
        reference_genome=ReferenceGenome.GRCh38,
        dataset_type=DatasetType.MITO,
        sample_type=SampleType.WGS,
        callset_path='fake_callset',
        project_guids=[
            'fake_project',
        ],
        project_pedigree_paths=['fake_pedigree'],
        skip_validation=True,
        run_id=TEST_RUN_ID,
    )
    worker.add(task)
    worker.run()
    self.assertTrue(task.output().exists())
    self.assertTrue(task.complete())
    df = pd.read_parquet(
        new_clinvar_variants_parquet_path(
            ReferenceGenome.GRCh38,
            DatasetType.MITO,
            TEST_RUN_ID,
        ),
    )
    # Only the first row is asserted; ndarray columns are converted to plain
    # lists so the comparison is against JSON-like python values.
    export_json = convert_ndarray_to_list(df.head(1).to_dict('records'))
    self.assertEqual(
        export_json,
        [
            {
                'key': 998,
                'alleleId': 677824,
                'conflictingPathogenicities': None,
                'goldStars': 1,
                'submitters': [
                    'Wong Mito Lab, Molecular and Human Genetics, Baylor College of Medicine',
                ],
                'conditions': ['MELAS syndrome'],
                'assertions': [],
                'pathogenicity': 'Likely_pathogenic',
            },
        ],
    )
12 changes: 8 additions & 4 deletions v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
)
from v03_pipeline.lib.paths import (
new_entries_parquet_path,
variant_annotations_table_path,
)
from v03_pipeline.lib.reference_datasets.reference_dataset import (
BaseReferenceDataset,
Expand Down Expand Up @@ -46,10 +47,10 @@ def output(self) -> luigi.Target:
),
)

def requires(self) -> list[luigi.Task]:
def requires(self) -> dict[str, luigi.Task]:
return {
ANNOTATIONS_TABLE_TASK: (
self.clone(UpdateVariantAnnotationsTableWithNewSamplesTask)
ANNOTATIONS_TABLE_TASK: self.clone(
UpdateVariantAnnotationsTableWithNewSamplesTask,
),
REMAPPED_AND_SUBSETTED_CALLSET_TASKS: [
self.clone(
Expand Down Expand Up @@ -93,7 +94,10 @@ def create_table(self) -> None:
)
ht = deglobalize_ids(ht)
annotations_ht = hl.read_table(
self.input()[ANNOTATIONS_TABLE_TASK].path,
variant_annotations_table_path(
self.reference_genome,
self.dataset_type,
),
)
ht = ht.join(annotations_ht)

Expand Down
Loading