Skip to content

Commit 85bf51d

Browse files
authored
Merge pull request #1075 from broadinstitute/remap-from-ped-file
Remap from ped file
2 parents 08692fa + 96edfd3 commit 85bf51d

36 files changed

+110
-248
lines changed

v03_pipeline/bin/pipeline_worker.py

-11
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,6 @@
1111
from v03_pipeline.lib.paths import (
1212
loading_pipeline_queue_path,
1313
project_pedigree_path,
14-
project_remap_path,
1514
)
1615
from v03_pipeline.lib.tasks.trigger_hail_backend_reload import TriggerHailBackendReload
1716
from v03_pipeline.lib.tasks.write_success_file import WriteSuccessFileTask
@@ -26,15 +25,6 @@ def main():
2625
continue
2726
with open(loading_pipeline_queue_path()) as f:
2827
lpr = LoadingPipelineRequest.model_validate_json(f.read())
29-
project_remap_paths = [
30-
project_remap_path(
31-
lpr.reference_genome,
32-
lpr.dataset_type,
33-
lpr.sample_type,
34-
project_guid,
35-
)
36-
for project_guid in lpr.projects_to_run
37-
]
3828
project_pedigree_paths = [
3929
project_pedigree_path(
4030
lpr.reference_genome,
@@ -49,7 +39,6 @@ def main():
4939
)
5040
loading_run_task_params = {
5141
'project_guids': lpr.projects_to_run,
52-
'project_remap_paths': project_remap_paths,
5342
'project_pedigree_paths': project_pedigree_paths,
5443
'run_id': run_id,
5544
**{k: v for k, v in lpr.model_dump().items() if k != 'projects_to_run'},

v03_pipeline/lib/misc/io.py

+3-13
Original file line number | Diff line number | Diff line change
@@ -283,31 +283,21 @@ def import_tdr_qc_metrics(file_path: str) -> hl.Table:
283283
return ht.key_by(ht.s)
284284

285285

286-
def import_remap(remap_path: str) -> hl.Table:
287-
ht = hl.import_table(remap_path)
288-
ht = ht.select(
289-
s=ht.s,
290-
seqr_id=ht.seqr_id,
291-
)
292-
return ht.key_by(ht.s)
293-
294-
295286
def import_pedigree(pedigree_path: str) -> hl.Table:
296287
ht = hl.import_table(pedigree_path, missing='')
288+
optional_selects = {'remap_id': ht.VCF_ID} if 'VCF_ID' in ht.row else {}
297289
return ht.select(
298290
sex=ht.Sex,
299291
family_guid=ht.Family_GUID,
300292
s=ht.Individual_ID,
301293
maternal_s=ht.Maternal_ID,
302294
paternal_s=ht.Paternal_ID,
295+
**optional_selects,
303296
)
304297

305298

306-
def remap_pedigree_hash(remap_path: str, pedigree_path: str) -> hl.Int32Expression:
299+
def remap_pedigree_hash(pedigree_path: str) -> hl.Int32Expression:
307300
sha256 = hashlib.sha256()
308-
if hfs.exists(remap_path):
309-
with hfs.open(remap_path) as f1:
310-
sha256.update(f1.read().encode('utf8'))
311301
with hfs.open(pedigree_path) as f2:
312302
sha256.update(f2.read().encode('utf8'))
313303
# maximum 4 byte int

v03_pipeline/lib/misc/io_test.py

+4-6
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,8 @@
2020
'v03_pipeline/var/test/sex_check/test_imputed_sex_unexpected_value.tsv'
2121
)
2222
TEST_INVALID_VCF = 'v03_pipeline/var/test/callsets/improperly_formatted.vcf'
23-
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
23+
TEST_PEDIGREE_3_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_3_remap.tsv'
2424
TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt'
25-
TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv'
2625

2726

2827
class IOTest(unittest.TestCase):
@@ -61,11 +60,10 @@ def test_remap_pedigree_hash(self) -> None:
6160
self.assertEqual(
6261
hl.eval(
6362
remap_pedigree_hash(
64-
TEST_REMAP,
65-
TEST_PEDIGREE_3,
63+
TEST_PEDIGREE_3_REMAP,
6664
),
6765
),
68-
-560434714,
66+
573002191,
6967
)
7068

7169
def test_import_vcf(self) -> None:
@@ -97,7 +95,7 @@ def test_import_vcf(self) -> None:
9795
SeqrValidationError,
9896
'VCF failed file format validation: Your input file has a malformed header: We never saw the required CHROM header line \\(starting with one #\\) for the input VCF file',
9997
import_vcf,
100-
TEST_PEDIGREE_3,
98+
TEST_PEDIGREE_3_REMAP,
10199
ReferenceGenome.GRCh38,
102100
)
103101
self.assertRaisesRegex(

v03_pipeline/lib/misc/pedigree.py

+7
Original file line number | Diff line number | Diff line change
@@ -170,3 +170,10 @@ def parse_pedigree_ht_to_families(
170170
):
171171
families.add(Family.parse(family_guid, list(rows)))
172172
return families
173+
174+
175+
def parse_pedigree_ht_to_remap_ht(pedigree_ht: hl.Table) -> hl.Table:
176+
ht = pedigree_ht.filter(hl.is_defined(pedigree_ht.remap_id))
177+
ht = ht.annotate(seqr_id=ht.s)
178+
ht = ht.key_by(s=ht.remap_id)
179+
return ht.select('seqr_id')

v03_pipeline/lib/paths.py

-18
Original file line number | Diff line number | Diff line change
@@ -399,24 +399,6 @@ def clinvar_dataset_path(reference_genome: ReferenceGenome, etag: str) -> str:
399399
)
400400

401401

402-
def project_remap_path(
403-
reference_genome: ReferenceGenome,
404-
dataset_type: DatasetType,
405-
sample_type: SampleType,
406-
project_guid: str,
407-
) -> str:
408-
return os.path.join(
409-
pipeline_prefix(
410-
Env.LOADING_DATASETS_DIR,
411-
reference_genome,
412-
dataset_type,
413-
),
414-
'remaps',
415-
sample_type.value,
416-
f'{project_guid}_remap.tsv',
417-
)
418-
419-
420402
def project_pedigree_path(
421403
reference_genome: ReferenceGenome,
422404
dataset_type: DatasetType,

v03_pipeline/lib/paths_test.py

-12
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,6 @@
1515
metadata_for_run_path,
1616
new_variants_table_path,
1717
project_pedigree_path,
18-
project_remap_path,
1918
project_table_path,
2019
relatedness_check_table_path,
2120
remapped_and_subsetted_callset_path,
@@ -225,17 +224,6 @@ def test_new_variants_table_path(self) -> None:
225224
'/var/seqr/seqr-hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/new_variants.ht',
226225
)
227226

228-
def test_project_remap_path(self) -> None:
229-
self.assertEqual(
230-
project_remap_path(
231-
ReferenceGenome.GRCh38,
232-
DatasetType.SNV_INDEL,
233-
SampleType.WGS,
234-
'R0652_pipeline_test',
235-
),
236-
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/remaps/WGS/R0652_pipeline_test_remap.tsv',
237-
)
238-
239227
def test_project_pedigree_path(self) -> None:
240228
self.assertEqual(
241229
project_pedigree_path(

v03_pipeline/lib/tasks/base/base_loading_run_params.py

-1
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,6 @@ class BaseLoadingRunParams(luigi.Task):
2020
sample_type = luigi.EnumParameter(enum=SampleType)
2121
callset_path = luigi.Parameter()
2222
project_guids = luigi.ListParameter(default=[])
23-
project_remap_paths = luigi.ListParameter(default=[])
2423
project_pedigree_paths = luigi.ListParameter(default=[])
2524
skip_check_sex_and_relatedness = luigi.BoolParameter(
2625
default=False,

v03_pipeline/lib/tasks/dataproc/misc_test.py

-3
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,6 @@ def test_to_kebab_str_args(self, _: Mock):
1919
sample_type=SampleType.WGS,
2020
callset_path='test_callset',
2121
project_guids=['R0113_test_project'],
22-
project_remap_paths=['test_remap'],
2322
project_pedigree_paths=['test_pedigree'],
2423
run_id='a_misc_run',
2524
)
@@ -38,8 +37,6 @@ def test_to_kebab_str_args(self, _: Mock):
3837
'test_callset',
3938
'--project-guids',
4039
'["R0113_test_project"]',
41-
'--project-remap-paths',
42-
'["test_remap"]',
4340
'--project-pedigree-paths',
4441
'["test_pedigree"]',
4542
'--skip-check-sex-and-relatedness',

v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py

-2
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,6 @@ def test_rsync_to_seqr_app_dirs_no_sync(
3434
sample_type=SampleType.WGS,
3535
callset_path='test_callset',
3636
project_guids=['R0113_test_project'],
37-
project_remap_paths=['test_remap'],
3837
project_pedigree_paths=['test_pedigree'],
3938
run_id='manual__2024-04-01',
4039
)
@@ -77,7 +76,6 @@ def test_rsync_to_seqr_app_dirs_sync(
7776
sample_type=SampleType.WGS,
7877
callset_path='test_callset',
7978
project_guids=['R0113_test_project'],
80-
project_remap_paths=['test_remap'],
8179
project_pedigree_paths=['test_pedigree'],
8280
run_id='manual__2024-04-02',
8381
)

v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc_test.py

-4
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,6 @@ def test_job_already_exists_failed(
4545
sample_type=SampleType.WGS,
4646
callset_path='test_callset',
4747
project_guids=['R0113_test_project'],
48-
project_remap_paths=['test_remap'],
4948
project_pedigree_paths=['test_pedigree'],
5049
run_id='manual__2024-04-03',
5150
)
@@ -79,7 +78,6 @@ def test_job_already_exists_success(
7978
sample_type=SampleType.WGS,
8079
callset_path='test_callset',
8180
project_guids=['R0113_test_project'],
82-
project_remap_paths=['test_remap'],
8381
project_pedigree_paths=['test_pedigree'],
8482
run_id='manual__2024-04-04',
8583
)
@@ -111,7 +109,6 @@ def test_job_failed(
111109
sample_type=SampleType.WGS,
112110
callset_path='test_callset',
113111
project_guids=['R0113_test_project'],
114-
project_remap_paths=['test_remap'],
115112
project_pedigree_paths=['test_pedigree'],
116113
run_id='manual__2024-04-05',
117114
)
@@ -152,7 +149,6 @@ def test_job_success(
152149
sample_type=SampleType.WGS,
153150
callset_path='test_callset',
154151
project_guids=['R0113_test_project'],
155-
project_remap_paths=['test_remap'],
156152
project_pedigree_paths=['test_pedigree'],
157153
run_id='manual__2024-04-06',
158154
)

v03_pipeline/lib/tasks/trigger_hail_backend_reload_test.py

+4-7
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,7 @@
1212
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
1313

1414
TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
15-
TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv'
16-
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
15+
TEST_PEDIGREE_3_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_3_remap.tsv'
1716

1817

1918
class TriggerHailBackendReloadTestCase(MockedDatarootTestCase):
@@ -35,7 +34,7 @@ def setUp(self) -> None:
3534
callset=TEST_VCF,
3635
project_guid='R0113_test_project',
3736
remap_pedigree_hash=hl.eval(
38-
remap_pedigree_hash(TEST_REMAP, TEST_PEDIGREE_3),
37+
remap_pedigree_hash(TEST_PEDIGREE_3_REMAP),
3938
),
4039
),
4140
},
@@ -71,8 +70,7 @@ def test_success(
7170
callset_path=TEST_VCF,
7271
project_guids=['R0113_test_project'],
7372
run_id='manual__2024-09-20',
74-
project_remap_paths=[TEST_REMAP],
75-
project_pedigree_paths=[TEST_PEDIGREE_3],
73+
project_pedigree_paths=[TEST_PEDIGREE_3_REMAP],
7674
)
7775
worker.add(task)
7876
worker.run()
@@ -100,8 +98,7 @@ def test_failure(
10098
callset_path=TEST_VCF,
10199
project_guids=['R0113_test_project'],
102100
run_id='manual__2024-09-20',
103-
project_remap_paths=[TEST_REMAP],
104-
project_pedigree_paths=[TEST_PEDIGREE_3],
101+
project_pedigree_paths=[TEST_PEDIGREE_3_REMAP],
105102
)
106103
worker.add(task)
107104
self.assertFalse(task.complete())

v03_pipeline/lib/tasks/update_lookup_table.py

-3
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,6 @@ def complete(self) -> bool:
3333
callset=self.callset_path,
3434
project_guid=project_guid,
3535
remap_pedigree_hash=remap_pedigree_hash(
36-
self.project_remap_paths[i],
3736
self.project_pedigree_paths[i],
3837
),
3938
),
@@ -63,7 +62,6 @@ def update_table(self, ht: hl.Table) -> hl.Table:
6362
callset=self.callset_path,
6463
project_guid=project_guid,
6564
remap_pedigree_hash=remap_pedigree_hash(
66-
self.project_remap_paths[i],
6765
self.project_pedigree_paths[i],
6866
),
6967
),
@@ -102,7 +100,6 @@ def update_table(self, ht: hl.Table) -> hl.Table:
102100
callset=self.callset_path,
103101
project_guid=project_guid,
104102
remap_pedigree_hash=remap_pedigree_hash(
105-
self.project_remap_paths[i],
106103
self.project_pedigree_paths[i],
107104
),
108105
),

v03_pipeline/lib/tasks/update_lookup_table_test.py

+5-8
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,7 @@
99
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
1010

1111
TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
12-
TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv'
13-
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
12+
TEST_PEDIGREE_3_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_3_remap.tsv'
1413

1514
TEST_RUN_ID = 'manual__2024-04-03'
1615

@@ -26,8 +25,7 @@ def test_skip_update_lookup_table_task(self) -> None:
2625
project_guids=[
2726
'R0555_seqr_demo',
2827
], # a project excluded from the lookup table
29-
project_remap_paths=[TEST_REMAP],
30-
project_pedigree_paths=[TEST_PEDIGREE_3],
28+
project_pedigree_paths=[TEST_PEDIGREE_3_REMAP],
3129
skip_validation=True,
3230
run_id=TEST_RUN_ID,
3331
)
@@ -47,7 +45,7 @@ def test_skip_update_lookup_table_task(self) -> None:
4745
callset=TEST_VCF,
4846
project_guid='R0555_seqr_demo',
4947
remap_pedigree_hash=hl.eval(
50-
remap_pedigree_hash(TEST_REMAP, TEST_PEDIGREE_3),
48+
remap_pedigree_hash(TEST_PEDIGREE_3_REMAP),
5149
),
5250
),
5351
},
@@ -65,8 +63,7 @@ def test_update_lookup_table_task(self) -> None:
6563
sample_type=SampleType.WGS,
6664
callset_path=TEST_VCF,
6765
project_guids=['R0113_test_project'],
68-
project_remap_paths=[TEST_REMAP],
69-
project_pedigree_paths=[TEST_PEDIGREE_3],
66+
project_pedigree_paths=[TEST_PEDIGREE_3_REMAP],
7067
skip_validation=True,
7168
run_id=TEST_RUN_ID,
7269
)
@@ -86,7 +83,7 @@ def test_update_lookup_table_task(self) -> None:
8683
callset=TEST_VCF,
8784
project_guid='R0113_test_project',
8885
remap_pedigree_hash=hl.eval(
89-
remap_pedigree_hash(TEST_REMAP, TEST_PEDIGREE_3),
86+
remap_pedigree_hash(TEST_PEDIGREE_3_REMAP),
9087
),
9188
),
9289
},

v03_pipeline/lib/tasks/update_project_table.py

-2
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,6 @@ def complete(self) -> bool:
4040
hl.Struct(
4141
callset=self.callset_path,
4242
remap_pedigree_hash=remap_pedigree_hash(
43-
self.project_remap_paths[self.project_i],
4443
self.project_pedigree_paths[self.project_i],
4544
),
4645
),
@@ -100,7 +99,6 @@ def update_table(self, ht: hl.Table) -> hl.Table:
10099
hl.Struct(
101100
callset=self.callset_path,
102101
remap_pedigree_hash=remap_pedigree_hash(
103-
self.project_remap_paths[self.project_i],
104102
self.project_pedigree_paths[self.project_i],
105103
),
106104
),

0 commit comments

Comments
 (0)