Skip to content

feat: implement mito #1105

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
May 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 0 additions & 14 deletions v03_pipeline/lib/model/dataset_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,20 +101,6 @@ def entries_fields(
},
}[self]

@property
def entries_export_fields(
self,
) -> Callable[[hl.StructExpression], hl.StructExpression]:
return {
DatasetType.SNV_INDEL: lambda fe: hl.Struct(
sampleId=fe.s,
gt=fe.GT.n_alt_alleles(),
gq=fe.GQ,
ab=fe.AB,
dp=fe.DP,
),
}[self]

@property
def row_fields(
self,
Expand Down
202 changes: 200 additions & 2 deletions v03_pipeline/lib/tasks/exports/fields.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,60 @@
import hail as hl

from v03_pipeline.lib.annotations.expression_helpers import get_expr_for_xpos
from v03_pipeline.lib.model import DatasetType, SampleType
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
from v03_pipeline.lib.tasks.exports.misc import array_structexpression_fields


def reference_independent_contig(locus: hl.LocusExpression) -> hl.StringExpression:
    """Return the contig name stripped of build-specific conventions.

    Removes a leading 'chr' prefix (GRCh38-style contigs) and rewrites the
    GRCh37 mitochondrial contig 'MT' as 'M'.
    """
    # hail's StringExpression.replace is regex-based, so '^chr' anchors the
    # match to the start of the contig name.
    without_prefix = locus.contig.replace('^chr', '')
    return without_prefix.replace('MT', 'M')


def get_dataset_type_specific_annotations(
    ht: hl.Table,
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
):
    """Return export annotations that only exist for the given dataset type."""
    if dataset_type == DatasetType.SNV_INDEL:
        annotations = {
            # hgmd is absent on some tables; export a typed missing struct so
            # the output schema is stable either way.
            'hgmd': (
                ht.hgmd
                if hasattr(ht, 'hgmd')
                else hl.missing(hl.tstruct(accession=hl.tstr, classification=hl.tstr))
            ),
        }
        # SCREEN region types are only annotated on GRCh38.
        if reference_genome == ReferenceGenome.GRCh38:
            annotations['screenRegionType'] = ht.screen.region_types.first()
        return annotations
    if dataset_type == DatasetType.MITO:
        return {
            'commonLowHeteroplasmy': ht.common_low_heteroplasmy,
            'mitomapPathogenic': ht.mitomap.pathogenic,
        }
    # Matches the KeyError raised by the original dict dispatch for
    # unsupported dataset types.
    raise KeyError(dataset_type)


def get_calls_export_fields(
    fe: hl.Struct,
    dataset_type: DatasetType,
):
    """Shape a single family-entry struct into the exported ``calls`` element."""
    if dataset_type == DatasetType.SNV_INDEL:
        return hl.Struct(
            sampleId=fe.s,
            gt=fe.GT.n_alt_alleles(),
            gq=fe.GQ,
            ab=fe.AB,
            dp=fe.DP,
        )
    if dataset_type == DatasetType.MITO:
        # MITO callsets carry heteroplasmy level (HL), mito copy number, and
        # contamination in place of GQ/AB.
        return hl.Struct(
            sampleId=fe.s,
            gt=fe.GT.n_alt_alleles(),
            dp=fe.DP,
            hl=fe.HL,
            mitoCn=fe.mito_cn,
            contamination=fe.contamination,
        )
    # Matches the KeyError raised by the original dict dispatch for
    # unsupported dataset types.
    raise KeyError(dataset_type)


def get_entries_export_fields(
Expand All @@ -25,7 +78,152 @@ def get_entries_export_fields(
),
'filters': ht.filters,
'calls': ht.family_entries.map(
lambda fe: dataset_type.entries_export_fields(fe),
lambda fe: get_calls_export_fields(fe, dataset_type),
),
'sign': 1,
}


def get_predictions_export_fields(
    ht: hl.Table,
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
):
    """Return the dataset-type-specific ``predictions`` struct fields for export.

    Fix: scraped review-comment text ("Copy link / Collaborator Author / ...")
    had been pasted into the middle of the MITO branch, breaking the syntax of
    the function; it is removed here and the clean dict restored.
    """
    return {
        DatasetType.SNV_INDEL: lambda ht: {
            'cadd': ht.dbnsfp.CADD_phred,
            'eigen': ht.eigen.Eigen_phred,
            'fathmm': ht.dbnsfp.fathmm_MKL_coding_score,
            **(
                {
                    # Non-coding constraint z-scores only exist on GRCh38.
                    'gnomad_noncoding': ht.gnomad_non_coding_constraint.z_score,
                }
                if reference_genome == ReferenceGenome.GRCh38
                else {}
            ),
            'mpc': ht.dbnsfp.MPC_score,
            'mut_pred': ht.dbnsfp.MutPred_score,
            'mut_tester': ht.dbnsfp.MutationTaster_pred,
            'polyphen': ht.dbnsfp.Polyphen2_HVAR_score,
            'primate_ai': ht.dbnsfp.PrimateAI_score,
            'revel': ht.dbnsfp.REVEL_score,
            'sift': ht.dbnsfp.SIFT_score,
            'splice_ai': ht.splice_ai.delta_score,
            'splice_ai_consequence': ht.splice_ai.splice_consequence,
            'vest': ht.dbnsfp.VEST4_score,
        },
        DatasetType.MITO: lambda ht: {
            'apogee': ht.mitimpact.score,
            # hl.or_missing logic ported from hail-search: export 'Y' only
            # when the variant is haplogroup-defining, otherwise missing.
            'haplogroup_defining': hl.or_missing(ht.haplogroup.is_defining, 'Y'),
            'hmtvar': ht.hmtvar.score,
            'mitotip': ht.mitotip.trna_prediction,
            'mut_taster': ht.dbnsfp.MutationTaster_pred,
            'sift': ht.dbnsfp.SIFT_score,
            'mlc': ht.local_constraint_mito.score,
        },
    }[dataset_type](ht)


def get_populations_export_fields(ht: hl.Table, dataset_type: DatasetType):
    """Return the dataset-type-specific ``populations`` struct fields for export.

    Fix: ``gnomad_mito_heteroplasmy.af`` previously exported ``AF_hom`` — a
    copy-paste slip from the homoplasmic ``gnomad_mito`` struct above it.  The
    heteroplasmy struct already uses ``AC_het`` for ``ac``, and the parallel
    ``helix_heteroplasmy`` struct uses ``AF_het``, so ``AF_het`` is exported
    here.
    """
    return {
        DatasetType.SNV_INDEL: lambda ht: {
            'exac': hl.Struct(
                ac=ht.exac.AC_Adj,
                af=ht.exac.AF,
                an=ht.exac.AN_Adj,
                filter_af=ht.exac.AF_POPMAX,
                hemi=ht.exac.AC_Hemi,
                het=ht.exac.AC_Het,
                hom=ht.exac.AC_Hom,
            ),
            'gnomad_exomes': hl.Struct(
                ac=ht.gnomad_exomes.AC,
                af=ht.gnomad_exomes.AF,
                an=ht.gnomad_exomes.AN,
                filter_af=ht.gnomad_exomes.AF_POPMAX_OR_GLOBAL,
                hemi=ht.gnomad_exomes.Hemi,
                hom=ht.gnomad_exomes.Hom,
            ),
            'gnomad_genomes': hl.Struct(
                ac=ht.gnomad_genomes.AC,
                af=ht.gnomad_genomes.AF,
                an=ht.gnomad_genomes.AN,
                filter_af=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL,
                hemi=ht.gnomad_genomes.Hemi,
                hom=ht.gnomad_genomes.Hom,
            ),
            'topmed': hl.Struct(
                ac=ht.topmed.AC,
                af=ht.topmed.AF,
                an=ht.topmed.AN,
                het=ht.topmed.Het,
                hom=ht.topmed.Hom,
            ),
        },
        DatasetType.MITO: lambda ht: {
            # Homoplasmic frequencies.
            'gnomad_mito': hl.Struct(
                ac=ht.gnomad_mito.AC_hom,
                af=ht.gnomad_mito.AF_hom,
                an=ht.gnomad_mito.AN,
            ),
            # Heteroplasmic frequencies (see docstring for the AF_het fix).
            'gnomad_mito_heteroplasmy': hl.Struct(
                ac=ht.gnomad_mito.AC_het,
                af=ht.gnomad_mito.AF_het,
                an=ht.gnomad_mito.AN,
                max_hl=ht.gnomad_mito.max_hl,
            ),
            'helix': hl.Struct(
                ac=ht.helix_mito.AC_hom,
                af=ht.helix_mito.AF_hom,
                an=ht.helix_mito.AN,
            ),
            'helix_heteroplasmy': hl.Struct(
                ac=ht.helix_mito.AC_het,
                af=ht.helix_mito.AF_het,
                an=ht.helix_mito.AN,
                max_hl=ht.helix_mito.max_hl,
            ),
        },
    }[dataset_type](ht)


def get_variants_export_fields(
    ht: hl.Table,
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
):
    """Assemble the full per-variant export field mapping.

    Combines the shared positional/identity fields with dataset-type-specific
    annotations, predictions, populations, and any array-of-struct fields
    present on the table.
    """
    fields = {
        'key_': ht.key_,
        'xpos': ht.xpos,
        'chrom': reference_independent_contig(ht.locus),
        'pos': ht.locus.position,
        'ref': ht.alleles[0],
        'alt': ht.alleles[1],
        'variantId': ht.variant_id,
        'rsid': ht.rsid,
    }
    # CAID is only present on some annotation tables.
    if hasattr(ht, 'CAID'):
        fields['CAID'] = ht.CAID
    # Tables carry a liftover locus for the *other* build: rg37_locus on
    # GRCh38 tables, rg38_locus on GRCh37 tables.
    lifted_over_locus = ht.rg37_locus if hasattr(ht, 'rg37_locus') else ht.rg38_locus
    fields['liftedOverChrom'] = reference_independent_contig(lifted_over_locus)
    fields['liftedOverPos'] = lifted_over_locus.position
    fields.update(
        get_dataset_type_specific_annotations(ht, reference_genome, dataset_type),
    )
    fields['predictions'] = hl.Struct(
        **get_predictions_export_fields(ht, reference_genome, dataset_type),
    )
    fields['populations'] = hl.Struct(
        **get_populations_export_fields(ht, dataset_type),
    )
    # Append remaining array-of-struct fields in deterministic (sorted) order.
    for field_name in sorted(array_structexpression_fields(ht)):
        fields[field_name] = ht[field_name]
    return fields
5 changes: 3 additions & 2 deletions v03_pipeline/lib/tasks/exports/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def subset_filterable_transcripts_fields(
def camelcase_array_structexpression_fields(
ht: hl.Table,
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
):
for field in array_structexpression_fields(ht):
ht = ht.transmute(
Expand All @@ -94,12 +95,12 @@ def camelcase_array_structexpression_fields(
},
)

# Custom handling of nested sorted_transcript_consequences fields for GRCh38
# Custom handling of nested sorted_transcript_consequences fields for GRCh38/SNV_INDEL.
# Note that spliceregion (extended_intronic_splice_region_variant) prevents
# a more procedural approach here.
if (
reference_genome == ReferenceGenome.GRCh38
and 'sortedTranscriptConsequences' in ht.row
and dataset_type == DatasetType.SNV_INDEL
):
ht = ht.annotate(
sortedTranscriptConsequences=ht.sortedTranscriptConsequences.map(
Expand Down
6 changes: 5 additions & 1 deletion v03_pipeline/lib/tasks/exports/misc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,11 @@ def test_camelcase_array_structexpression_fields(self) -> None:
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
)
ht = camelcase_array_structexpression_fields(ht, ReferenceGenome.GRCh38)
ht = camelcase_array_structexpression_fields(
ht,
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
)
ht = ht.annotate(
sortedTranscriptConsequences=[ht.sortedTranscriptConsequences[0]],
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from unittest import mock
from unittest.mock import Mock

import hail as hl
import luigi.worker
import pandas as pd
Expand All @@ -15,11 +18,13 @@
WriteNewClinvarVariantsParquetTask,
)
from v03_pipeline.lib.test.misc import convert_ndarray_to_list
from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase

TEST_SNV_INDEL_ANNOTATIONS = (
'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht'
)
TEST_MITO_ANNOTATIONS = 'v03_pipeline/var/test/exports/GRCh38/MITO/annotations.ht'

TEST_RUN_ID = 'manual__2024-04-03'

Expand All @@ -37,6 +42,16 @@ def setUp(self) -> None:
TEST_RUN_ID,
),
)
ht = hl.read_table(
TEST_MITO_ANNOTATIONS,
)
ht.write(
new_variants_table_path(
ReferenceGenome.GRCh38,
DatasetType.MITO,
TEST_RUN_ID,
),
)

def test_write_new_clinvar_variants_parquet_test(
self,
Expand Down Expand Up @@ -81,3 +96,54 @@ def test_write_new_clinvar_variants_parquet_test(
},
],
)

@mock.patch(
    'v03_pipeline.lib.tasks.exports.write_new_clinvar_variants_parquet.WriteNewVariantsTableTask',
)
def test_write_new_mito_clinvar_variants_parquet_test(
    self,
    write_new_variants_table_task: Mock,
) -> None:
    """Run the MITO clinvar-variants parquet export task end-to-end and
    assert on the first exported record."""
    # Mock the upstream variants-table dependency as already complete so the
    # task reads the MITO annotations table written in setUp instead of
    # rebuilding it.
    write_new_variants_table_task.return_value = MockCompleteTask()
    worker = luigi.worker.Worker()
    task = WriteNewClinvarVariantsParquetTask(
        reference_genome=ReferenceGenome.GRCh38,
        dataset_type=DatasetType.MITO,
        sample_type=SampleType.WGS,
        callset_path='fake_callset',
        project_guids=[
            'fake_project',
        ],
        project_pedigree_paths=['fake_pedigree'],
        skip_validation=True,
        run_id=TEST_RUN_ID,
    )
    worker.add(task)
    worker.run()
    self.assertTrue(task.output().exists())
    self.assertTrue(task.complete())
    df = pd.read_parquet(
        new_clinvar_variants_parquet_path(
            ReferenceGenome.GRCh38,
            DatasetType.MITO,
            TEST_RUN_ID,
        ),
    )
    # Only the first row is asserted; ndarray columns are converted to plain
    # lists so the comparison is against JSON-like python values.
    export_json = convert_ndarray_to_list(df.head(1).to_dict('records'))
    self.assertEqual(
        export_json,
        [
            {
                'key': 998,
                'alleleId': 677824,
                'conflictingPathogenicities': None,
                'goldStars': 1,
                'submitters': [
                    'Wong Mito Lab, Molecular and Human Genetics, Baylor College of Medicine',
                ],
                'conditions': ['MELAS syndrome'],
                'assertions': [],
                'pathogenicity': 'Likely_pathogenic',
            },
        ],
    )
12 changes: 8 additions & 4 deletions v03_pipeline/lib/tasks/exports/write_new_entries_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
)
from v03_pipeline.lib.paths import (
new_entries_parquet_path,
variant_annotations_table_path,
)
from v03_pipeline.lib.reference_datasets.reference_dataset import (
BaseReferenceDataset,
Expand Down Expand Up @@ -46,10 +47,10 @@ def output(self) -> luigi.Target:
),
)

def requires(self) -> list[luigi.Task]:
def requires(self) -> dict[str, luigi.Task]:
return {
ANNOTATIONS_TABLE_TASK: (
self.clone(UpdateVariantAnnotationsTableWithNewSamplesTask)
ANNOTATIONS_TABLE_TASK: self.clone(
UpdateVariantAnnotationsTableWithNewSamplesTask,
),
REMAPPED_AND_SUBSETTED_CALLSET_TASKS: [
self.clone(
Expand Down Expand Up @@ -93,7 +94,10 @@ def create_table(self) -> None:
)
ht = deglobalize_ids(ht)
annotations_ht = hl.read_table(
self.input()[ANNOTATIONS_TABLE_TASK].path,
variant_annotations_table_path(
self.reference_genome,
self.dataset_type,
),
)
ht = ht.join(annotations_ht)

Expand Down
Loading