chore: Remove code that supports expecting filters (#1072)

bpblanken · web-flow · commit 1c305ab076b8 · 2025-04-15T10:11:32.000-04:00
* Remove code that supports expecting filters

* Update kebab-case args test to drop --skip-expect-filters
diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py
@@ -4,7 +4,7 @@
 import hail as hl
 
 from v03_pipeline.lib.annotations import gcnv, mito, shared, snv_indel, sv
-from v03_pipeline.lib.model.definitions import ReferenceGenome, SampleType
+from v03_pipeline.lib.model.definitions import ReferenceGenome
 
 MITO_MIN_HOM_THRESHOLD = 0.95
 ZERO = 0.0
@@ -183,12 +183,6 @@ def has_gencode_ensembl_to_refseq_id_mapping(
             self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38
         )
 
-    def expect_filters(
-        self,
-        sample_type: SampleType,
-    ) -> bool:
-        return self == DatasetType.SNV_INDEL and sample_type == SampleType.WES
-
     def expect_tdr_metrics(
         self,
         reference_genome: ReferenceGenome,
diff --git a/v03_pipeline/lib/model/feature_flag.py b/v03_pipeline/lib/model/feature_flag.py
@@ -7,7 +7,6 @@
 )
 CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1'
 EXPECT_TDR_METRICS = os.environ.get('EXPECT_TDR_METRICS') == '1'
-EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1'
 INCLUDE_PIPELINE_VERSION_IN_PREFIX = (
     os.environ.get('INCLUDE_PIPELINE_VERSION_IN_PREFIX') == '1'
 )
@@ -22,7 +21,6 @@ class FeatureFlag:
     ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS
     CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS
     EXPECT_TDR_METRICS: bool = EXPECT_TDR_METRICS
-    EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
     INCLUDE_PIPELINE_VERSION_IN_PREFIX: bool = INCLUDE_PIPELINE_VERSION_IN_PREFIX
     RUN_PIPELINE_ON_DATAPROC: bool = RUN_PIPELINE_ON_DATAPROC
     SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: bool = SHOULD_TRIGGER_HAIL_BACKEND_RELOAD
diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py
@@ -1,6 +1,5 @@
 import hashlib
 import os
-import re
 
 import hailtop.fs as hfs
 
@@ -297,24 +296,6 @@ def sex_check_table_path(
     )
 
 
-def valid_filters_path(
-    dataset_type: DatasetType,
-    sample_type: SampleType,
-    callset_path: str,
-) -> str | None:
-    if (
-        not FeatureFlag.EXPECT_WES_FILTERS
-        or not dataset_type.expect_filters(sample_type)
-        or 'part_one_outputs' not in callset_path
-    ):
-        return None
-    return re.sub(
-        'part_one_outputs/.*$',
-        'part_two_outputs/*.filtered.*.vcf.gz',
-        callset_path,
-    )
-
-
 def valid_reference_dataset_path(
     reference_genome: ReferenceGenome,
     reference_dataset: ReferenceDataset,
diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py
@@ -20,7 +20,6 @@
     remapped_and_subsetted_callset_path,
     sex_check_table_path,
     tdr_metrics_path,
-    valid_filters_path,
     validation_errors_for_run_path,
     variant_annotations_table_path,
 )
@@ -66,26 +65,6 @@ def test_family_table_path(self) -> None:
                 '/var/bucket/GRCh37/SNV_INDEL/families/WES/franklin.ht',
             )
 
-    def test_valid_filters_path(self) -> None:
-        self.assertEqual(
-            valid_filters_path(
-                DatasetType.MITO,
-                SampleType.WES,
-                '/var/bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
-            ),
-            None,
-        )
-        with patch('v03_pipeline.lib.paths.FeatureFlag') as mock_ff:
-            mock_ff.EXPECT_WES_FILTERS = True
-            self.assertEqual(
-                valid_filters_path(
-                    DatasetType.SNV_INDEL,
-                    SampleType.WES,
-                    '/var/bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
-                ),
-                '/var/bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz',
-            )
-
     def test_project_table_path(self) -> None:
         self.assertEqual(
             project_table_path(
diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py
@@ -25,10 +25,6 @@ class BaseLoadingRunParams(luigi.Task):
         default=False,
         parsing=luigi.BoolParameter.EXPLICIT_PARSING,
     )
-    skip_expect_filters = luigi.BoolParameter(
-        default=False,
-        parsing=luigi.BoolParameter.EXPLICIT_PARSING,
-    )
     skip_expect_tdr_metrics = luigi.BoolParameter(
         default=False,
         parsing=luigi.BoolParameter.EXPLICIT_PARSING,
diff --git a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py
@@ -105,9 +105,6 @@ def get_cluster_config(reference_genome: ReferenceGenome, run_id: str):
                     'spark-env:EXPECT_TDR_METRICS': '1'
                     if FeatureFlag.EXPECT_TDR_METRICS
                     else '0',
-                    'spark-env:EXPECT_WES_FILTERS': '1'
-                    if FeatureFlag.EXPECT_WES_FILTERS
-                    else '0',
                     'spark-env:HAIL_SEARCH_DATA_DIR': Env.HAIL_SEARCH_DATA_DIR,
                     'spark-env:HAIL_TMP_DIR': Env.HAIL_TMP_DIR,
                     'spark-env:INCLUDE_PIPELINE_VERSION_IN_PREFIX': '1'
diff --git a/v03_pipeline/lib/tasks/dataproc/misc_test.py b/v03_pipeline/lib/tasks/dataproc/misc_test.py
@@ -41,8 +41,6 @@ def test_to_kebab_str_args(self, _: Mock):
                 '["test_pedigree"]',
                 '--skip-check-sex-and-relatedness',
                 'False',
-                '--skip-expect-filters',
-                'False',
                 '--skip-expect-tdr-metrics',
                 'False',
                 '--skip-validation',
diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py
@@ -5,7 +5,6 @@
 from v03_pipeline.lib.misc.callsets import get_additional_row_fields
 from v03_pipeline.lib.misc.io import (
     import_callset,
-    import_vcf,
     select_relevant_fields,
     split_multi_hts,
 )
@@ -14,10 +13,8 @@
     validate_imported_field_types,
 )
 from v03_pipeline.lib.misc.vets import annotate_vets
-from v03_pipeline.lib.model.feature_flag import FeatureFlag
 from v03_pipeline.lib.paths import (
     imported_callset_path,
-    valid_filters_path,
     variant_annotations_table_path,
 )
 from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
@@ -43,26 +40,7 @@ def output(self) -> luigi.Target:
         )
 
     def requires(self) -> list[luigi.Task]:
-        requirements = []
-        if (
-            FeatureFlag.EXPECT_WES_FILTERS
-            and not self.skip_expect_filters
-            and self.dataset_type.expect_filters(
-                self.sample_type,
-            )
-        ):
-            requirements = [
-                *requirements,
-                CallsetTask(
-                    valid_filters_path(
-                        self.dataset_type,
-                        self.sample_type,
-                        self.callset_path,
-                    ),
-                ),
-            ]
         return [
-            *requirements,
             CallsetTask(self.callset_path),
         ]
 
@@ -74,21 +52,6 @@ def create_table(self) -> hl.MatrixTable:
             self.reference_genome,
             self.dataset_type,
         )
-        filters_path = None
-        if (
-            FeatureFlag.EXPECT_WES_FILTERS
-            and not self.skip_expect_filters
-            and self.dataset_type.expect_filters(
-                self.sample_type,
-            )
-        ):
-            filters_path = valid_filters_path(
-                self.dataset_type,
-                self.sample_type,
-                self.callset_path,
-            )
-            filters_ht = import_vcf(filters_path, self.reference_genome).rows()
-            mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters)
         additional_row_fields = get_additional_row_fields(
             mt,
             self.reference_genome,
@@ -139,5 +102,4 @@ def create_table(self) -> hl.MatrixTable:
         mt = annotate_vets(mt)
         return mt.select_globals(
             callset_path=self.callset_path,
-            filters_path=filters_path or hl.missing(hl.tstr),
         )