Skip to content

Commit 82eb935

Browse files
authored
Merge pull request #263 from Ferlab-Ste-Justine/feat/clin-4027
feat: CLIN-4027 use and manage gnomad joint
2 parents ed714a9 + 0d5dd04 commit 82eb935

File tree

12 files changed

+377
-347
lines changed

12 files changed

+377
-347
lines changed

datalake-spark3/src/main/resources/reference_kf.conf

+6-6
Original file line number | Diff line number | Diff line change
@@ -63,11 +63,11 @@ datalake {
6363
},
6464
{
6565
format=VCF
66-
id="raw_gnomad_genomes_v4"
66+
id="raw_gnomad_joint_v4"
6767
keys=[]
6868
loadtype=OverWrite
6969
partitionby=[]
70-
path="/raw/landing/gnomad_v4/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr[^M]*.vcf.bgz"
70+
path="/raw/landing/gnomad_v4/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr[^M]*.vcf.bgz"
7171
readoptions {
7272
flattenInfoFields="true"
7373
"split_multiallelics"="true"
@@ -874,22 +874,22 @@ datalake {
874874
},
875875
{
876876
format=DELTA
877-
id="normalized_gnomad_genomes_v4"
877+
id="normalized_gnomad_joint_v4"
878878
keys=[]
879879
loadtype=OverWrite
880880
partitionby=[
881881
chromosome
882882
]
883-
path="/public/gnomad_genomes_v4"
883+
path="/public/gnomad_joint_v4"
884884
readoptions {}
885885
storageid="public_database"
886886
table {
887887
database=variant
888-
name="gnomad_genomes_v4"
888+
name="gnomad_joint_v4"
889889
}
890890
view {
891891
database="variant_live"
892-
name="gnomad_genomes_v4"
892+
name="gnomad_joint_v4"
893893
}
894894
writeoptions {
895895
"created_on_column"="created_on"

datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/genomics/enriched/Variants.scala

+14-6
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,7 @@ case class Variants(rc: RuntimeETLContext, participantId: Column = col("particip
4242
protected val gnomad_genomes_v2: DatasetConf = conf.getDataset("normalized_gnomad_genomes_v2_1_1")
4343
protected val gnomad_exomes_v2: DatasetConf = conf.getDataset("normalized_gnomad_exomes_v2_1_1")
4444
protected val gnomad_genomes_v3: DatasetConf = conf.getDataset("normalized_gnomad_genomes_v3")
45-
protected val gnomad_genomes_v4: DatasetConf = conf.getDataset("normalized_gnomad_genomes_v4")
45+
protected val gnomad_joint_v4: DatasetConf = conf.getDataset("normalized_gnomad_joint_v4")
4646
protected val dbsnp: DatasetConf = conf.getDataset("normalized_dbsnp")
4747
protected val clinvar: DatasetConf = conf.getDataset("normalized_clinvar")
4848
protected val genes: DatasetConf = conf.getDataset("enriched_genes")
@@ -89,7 +89,7 @@ case class Variants(rc: RuntimeETLContext, participantId: Column = col("particip
8989

9090
variantsCheckpoint
9191
.withFrequencies(participantId, affectedStatus, snv, splits, checkpoint)
92-
.withPopulations(data(thousand_genomes.id), data(topmed_bravo.id), data(gnomad_genomes_v2.id), data(gnomad_exomes_v2.id), data(gnomad_genomes_v3.id), data(gnomad_genomes_v4.id))
92+
.withPopulations(data(thousand_genomes.id), data(topmed_bravo.id), data(gnomad_genomes_v2.id), data(gnomad_exomes_v2.id), data(gnomad_genomes_v3.id), data(gnomad_joint_v4.id))
9393
.withDbSNP(data(dbsnp.id))
9494
.withClinvar(data(clinvar.id))
9595
.withGenes(data(genes.id))
@@ -132,7 +132,9 @@ object Variants {
132132
val conditionValueMap: List[(Column, String)] = List(
133133
$"clinvar".isNotNull -> "Clinvar",
134134
$"cmc".isNotNull -> "Cosmic",
135-
$"external_frequencies.gnomad_genomes_4".isNotNull -> "gnomAD",
135+
($"external_frequencies.gnomad_genomes_4".isNotNull or
136+
$"external_frequencies.gnomad_exomes_4".isNotNull or
137+
$"external_frequencies.gnomad_joint_4".isNotNull) -> "gnomAD",
136138
)
137139
val dfWithVariantExternalReference = conditionValueMap.foldLeft {
138140
df.withColumn(outputColumn, when($"rsnumber".isNotNull, array(lit("DBSNP"))).otherwise(array()))
@@ -154,7 +156,7 @@ object Variants {
154156
gnomadGenomesV2: DataFrame,
155157
gnomadExomesV2: DataFrame,
156158
gnomadGenomesV3: DataFrame,
157-
gnomadGenomesV4: DataFrame)(implicit spark: SparkSession): DataFrame = {
159+
gnomadJointV4: DataFrame)(implicit spark: SparkSession): DataFrame = {
158160
import spark.implicits._
159161
val shapedThousandGenomes = thousandGenomes
160162
.selectLocus($"ac".cast("long"), $"af", $"an".cast("long"))
@@ -169,7 +171,9 @@ object Variants {
169171
val shapedGnomadGenomesV2 = gnomadGenomesV2.selectLocus($"ac".cast("long"), $"af", $"an".cast("long"), $"hom".cast("long"))
170172
val shapedGnomadExomesV2 = gnomadExomesV2.selectLocus($"ac".cast("long"), $"af", $"an".cast("long"), $"hom".cast("long"))
171173
val shapedGnomadGenomesV3 = gnomadGenomesV3.selectLocus($"ac".cast("long"), $"af", $"an".cast("long"), $"nhomalt".cast("long") as "hom")
172-
val shapedGnomadGenomesV4 = gnomadGenomesV4.selectLocus($"ac", $"af", $"an", $"hom")
174+
val shapedGnomadGenomesV4 = gnomadJointV4.selectLocus($"ac_genomes" as "ac", $"af_genomes" as "af", $"an_genomes" as "an", $"hom_genomes" as "hom")
175+
val shapedGnomadExomesV4 = gnomadJointV4.selectLocus($"ac_exomes" as "ac", $"af_exomes" as "af", $"an_exomes" as "an", $"hom_exomes" as "hom")
176+
val shapedGnomadJointV4 = gnomadJointV4.selectLocus($"ac_joint" as "ac", $"af_joint" as "af", $"an_joint" as "an", $"hom_joint" as "hom")
173177

174178
df
175179
.joinAndMerge(shapedThousandGenomes, "thousand_genomes", "left")
@@ -178,14 +182,18 @@ object Variants {
178182
.joinAndMerge(shapedGnomadExomesV2, "gnomad_exomes_2_1_1", "left")
179183
.joinAndMerge(shapedGnomadGenomesV3, "gnomad_genomes_3", "left")
180184
.joinAndMerge(shapedGnomadGenomesV4, "gnomad_genomes_4", "left")
185+
.joinAndMerge(shapedGnomadExomesV4, "gnomad_exomes_4", "left")
186+
.joinAndMerge(shapedGnomadJointV4, "gnomad_joint_4", "left")
181187
.select(df("*"),
182188
struct(
183189
col("thousand_genomes"),
184190
col("topmed_bravo"),
185191
col("gnomad_genomes_2_1_1"),
186192
col("gnomad_exomes_2_1_1"),
187193
col("gnomad_genomes_3"),
188-
col("gnomad_genomes_4")) as "external_frequencies")
194+
col("gnomad_genomes_4"),
195+
col("gnomad_exomes_4"),
196+
col("gnomad_joint_4")) as "external_frequencies")
189197
}
190198

191199
def withDbSNP(dbsnp: DataFrame): DataFrame = {

datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/PublicDatasets.scala

+2-2
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ case class PublicDatasets(alias: String, tableDatabase: Option[String], viewData
1313
DatasetConf("raw_clinvar" , alias, "/raw/landing/clinvar/clinvar.vcf.gz" , VCF , OverWrite , readoptions = Map("flattenInfoFields" -> "true", "split_multiallelics" -> "true")),
1414
DatasetConf("raw_dbsnp" , alias, "/raw/landing/dbsnp/GCF_000001405.40.gz" , VCF , OverWrite , readoptions = Map("flattenInfoFields" -> "true", "split_multiallelics" -> "true")),
1515
DatasetConf("raw_gnomad_genomes_v3" , alias, "/release/3.1/vcf/genomes/gnomad.genomes.v3.1.sites.chr[^M]*.vcf.bgz", VCF , OverWrite , readoptions = Map("flattenInfoFields" -> "true", "split_multiallelics" -> "true")).copy(storageid = gnomadStorageId),
16-
DatasetConf("raw_gnomad_genomes_v4" , alias, "/raw/landing/gnomad_v4/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr[^M]*.vcf.bgz", VCF , OverWrite , readoptions = Map("flattenInfoFields" -> "true", "split_multiallelics" -> "true")),
16+
DatasetConf("raw_gnomad_joint_v4" , alias, "/raw/landing/gnomad_v4/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr[^M]*.vcf.bgz", VCF , OverWrite , readoptions = Map("flattenInfoFields" -> "true", "split_multiallelics" -> "true")),
1717
DatasetConf("raw_gnomad_constraint_v2_1_1" , alias, "/raw/landing/gnomad_v2_1_1/gnomad.v2.1.1.lof_metrics.by_gene.txt.gz", CSV , OverWrite , readoptions = Map("header" -> "true", "sep" -> "\t")),
1818
DatasetConf("raw_topmed_bravo" , alias, "/raw/landing/topmed/bravo-dbsnp-*.vcf.gz" , VCF , OverWrite , readoptions = Map("flattenInfoFields" -> "true", "split_multiallelics" -> "true")),
1919
DatasetConf("raw_1000_genomes" , alias, "/raw/landing/1000Genomes/ALL.*.sites.vcf.gz" , VCF , OverWrite , readoptions = Map("flattenInfoFields" -> "true", "split_multiallelics" -> "true")),
@@ -51,7 +51,7 @@ case class PublicDatasets(alias: String, tableDatabase: Option[String], viewData
5151
DatasetConf("normalized_gnomad_exomes_v2_1_1" , alias, "/public/gnomad_exomes_v2_1_1_liftover_grch38" , DELTA, OverWrite , partitionby = List("chromosome"), table = table("gnomad_exomes_v2_1_1") , view = view("gnomad_exomes_v2_1_1")),
5252
DatasetConf("normalized_gnomad_constraint_v2_1_1", alias, "/public/gnomad_constraint_v2_1_1" , DELTA, OverWrite , partitionby = List("chromosome"), table = table("gnomad_constraint_v_2_1_1"), view = view("gnomad_constraint_v_2_1_1")),
5353
DatasetConf("normalized_gnomad_genomes_v3" , alias, "/public/gnomad_genomes_v3" , DELTA, OverWrite , partitionby = List("chromosome"), table = table("gnomad_genomes_v3") , view = view("gnomad_genomes_v3")),
54-
DatasetConf("normalized_gnomad_genomes_v4" , alias, "/public/gnomad_genomes_v4" , DELTA, OverWrite , partitionby = List("chromosome"), table = table("gnomad_genomes_v4") , view = view("gnomad_genomes_v4")),
54+
DatasetConf("normalized_gnomad_joint_v4" , alias, "/public/gnomad_joint_v4" , DELTA, OverWrite , partitionby = List("chromosome"), table = table("gnomad_joint_v4") , view = view("gnomad_joint_v4")),
5555
DatasetConf("normalized_human_genes" , alias, "/public/human_genes" , DELTA, OverWrite , partitionby = List() , table = table("human_genes") , view = view("human_genes")),
5656
DatasetConf("normalized_hpo_gene_set" , alias, "/public/hpo_gene_set" , DELTA, OverWrite , partitionby = List() , table = table("hpo_gene_set") , view = view("hpo_gene_set")),
5757
DatasetConf("normalized_omim_gene_set" , alias, "/public/omim_gene_set" , DELTA, OverWrite , partitionby = List() , table = table("omim_gene_set") , view = view("omim_gene_set")),

datalake-spark3/src/main/scala/bio/ferlab/datalake/spark3/publictables/normalized/gnomad/GnomadV4.scala

+14-6
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,8 @@ import java.time.LocalDateTime
1111

1212
case class GnomadV4(rc: RuntimeETLContext) extends SimpleETLP(rc) {
1313

14-
override val mainDestination: DatasetConf = conf.getDataset("normalized_gnomad_genomes_v4")
15-
val gnomad_vcf: DatasetConf = conf.getDataset("raw_gnomad_genomes_v4")
14+
override val mainDestination: DatasetConf = conf.getDataset("normalized_gnomad_joint_v4")
15+
val gnomad_vcf: DatasetConf = conf.getDataset("raw_gnomad_joint_v4")
1616

1717
override def extract(lastRunValue: LocalDateTime = minValue,
1818
currentRunValue: LocalDateTime = LocalDateTime.now()): Map[String, DataFrame] = {
@@ -47,10 +47,18 @@ case class GnomadV4(rc: RuntimeETLContext) extends SimpleETLP(rc) {
4747
$"alternate",
4848
$"qual",
4949
$"name",
50-
$"ac".cast("long"),
51-
$"af",
52-
$"an".cast("long"),
53-
$"nhomalt".cast("long") as "hom"
50+
$"ac_joint".cast("long"),
51+
$"af_joint",
52+
$"an_joint".cast("long"),
53+
$"nhomalt_joint".cast("long") as "hom_joint",
54+
$"ac_genomes".cast("long"),
55+
$"af_genomes",
56+
$"an_genomes".cast("long"),
57+
$"nhomalt_genomes".cast("long") as "hom_genomes",
58+
$"ac_exomes".cast("long"),
59+
$"af_exomes",
60+
$"an_exomes".cast("long"),
61+
$"nhomalt_exomes".cast("long") as "hom_exomes",
5462
)
5563
}
5664

datalake-spark3/src/test/resources/config/reference_kf.conf

+6-6
Original file line number | Diff line number | Diff line change
@@ -63,11 +63,11 @@ datalake {
6363
},
6464
{
6565
format=VCF
66-
id="raw_gnomad_genomes_v4"
66+
id="raw_gnomad_joint_v4"
6767
keys=[]
6868
loadtype=OverWrite
6969
partitionby=[]
70-
path="/raw/landing/gnomad_v4/release/4.1/vcf/genomes/gnomad.genomes.v4.1.sites.chr[^M]*.vcf.bgz"
70+
path="/raw/landing/gnomad_v4/release/4.1/vcf/joint/gnomad.joint.v4.1.sites.chr[^M]*.vcf.bgz"
7171
readoptions {
7272
flattenInfoFields="true"
7373
"split_multiallelics"="true"
@@ -874,22 +874,22 @@ datalake {
874874
},
875875
{
876876
format=DELTA
877-
id="normalized_gnomad_genomes_v4"
877+
id="normalized_gnomad_joint_v4"
878878
keys=[]
879879
loadtype=OverWrite
880880
partitionby=[
881881
chromosome
882882
]
883-
path="/public/gnomad_genomes_v4"
883+
path="/public/gnomad_joint_v4"
884884
readoptions {}
885885
storageid="public_database"
886886
table {
887887
database=variant
888-
name="gnomad_genomes_v4"
888+
name="gnomad_joint_v4"
889889
}
890890
view {
891891
database="variant_live"
892-
name="gnomad_genomes_v4"
892+
name="gnomad_joint_v4"
893893
}
894894
writeoptions {
895895
"created_on_column"="created_on"

0 commit comments

Comments
 (0)