@@ -42,7 +42,7 @@ case class Variants(rc: RuntimeETLContext, participantId: Column = col("particip
42
42
protected val gnomad_genomes_v2 : DatasetConf = conf.getDataset(" normalized_gnomad_genomes_v2_1_1" )
43
43
protected val gnomad_exomes_v2 : DatasetConf = conf.getDataset(" normalized_gnomad_exomes_v2_1_1" )
44
44
protected val gnomad_genomes_v3 : DatasetConf = conf.getDataset(" normalized_gnomad_genomes_v3" )
45
- protected val gnomad_genomes_v4 : DatasetConf = conf.getDataset(" normalized_gnomad_genomes_v4 " )
45
+ protected val gnomad_joint_v4 : DatasetConf = conf.getDataset(" normalized_gnomad_joint_v4 " )
46
46
protected val dbsnp : DatasetConf = conf.getDataset(" normalized_dbsnp" )
47
47
protected val clinvar : DatasetConf = conf.getDataset(" normalized_clinvar" )
48
48
protected val genes : DatasetConf = conf.getDataset(" enriched_genes" )
@@ -89,7 +89,7 @@ case class Variants(rc: RuntimeETLContext, participantId: Column = col("particip
89
89
90
90
variantsCheckpoint
91
91
.withFrequencies(participantId, affectedStatus, snv, splits, checkpoint)
92
- .withPopulations(data(thousand_genomes.id), data(topmed_bravo.id), data(gnomad_genomes_v2.id), data(gnomad_exomes_v2.id), data(gnomad_genomes_v3.id), data(gnomad_genomes_v4 .id))
92
+ .withPopulations(data(thousand_genomes.id), data(topmed_bravo.id), data(gnomad_genomes_v2.id), data(gnomad_exomes_v2.id), data(gnomad_genomes_v3.id), data(gnomad_joint_v4 .id))
93
93
.withDbSNP(data(dbsnp.id))
94
94
.withClinvar(data(clinvar.id))
95
95
.withGenes(data(genes.id))
@@ -132,7 +132,9 @@ object Variants {
132
132
val conditionValueMap : List [(Column , String )] = List (
133
133
$" clinvar" .isNotNull -> " Clinvar" ,
134
134
$" cmc" .isNotNull -> " Cosmic" ,
135
- $" external_frequencies.gnomad_genomes_4" .isNotNull -> " gnomAD" ,
135
+ ($" external_frequencies.gnomad_genomes_4" .isNotNull or
136
+ $" external_frequencies.gnomad_exomes_4" .isNotNull or
137
+ $" external_frequencies.gnomad_joint_4" .isNotNull) -> " gnomAD" ,
136
138
)
137
139
val dfWithVariantExternalReference = conditionValueMap.foldLeft {
138
140
df.withColumn(outputColumn, when($" rsnumber" .isNotNull, array(lit(" DBSNP" ))).otherwise(array()))
@@ -154,7 +156,7 @@ object Variants {
154
156
gnomadGenomesV2 : DataFrame ,
155
157
gnomadExomesV2 : DataFrame ,
156
158
gnomadGenomesV3 : DataFrame ,
157
- gnomadGenomesV4 : DataFrame )(implicit spark : SparkSession ): DataFrame = {
159
+ gnomadJointV4 : DataFrame )(implicit spark : SparkSession ): DataFrame = {
158
160
import spark .implicits ._
159
161
val shapedThousandGenomes = thousandGenomes
160
162
.selectLocus($" ac" .cast(" long" ), $" af" , $" an" .cast(" long" ))
@@ -169,7 +171,9 @@ object Variants {
169
171
val shapedGnomadGenomesV2 = gnomadGenomesV2.selectLocus($" ac" .cast(" long" ), $" af" , $" an" .cast(" long" ), $" hom" .cast(" long" ))
170
172
val shapedGnomadExomesV2 = gnomadExomesV2.selectLocus($" ac" .cast(" long" ), $" af" , $" an" .cast(" long" ), $" hom" .cast(" long" ))
171
173
val shapedGnomadGenomesV3 = gnomadGenomesV3.selectLocus($" ac" .cast(" long" ), $" af" , $" an" .cast(" long" ), $" nhomalt" .cast(" long" ) as " hom" )
172
- val shapedGnomadGenomesV4 = gnomadGenomesV4.selectLocus($" ac" , $" af" , $" an" , $" hom" )
174
+ val shapedGnomadGenomesV4 = gnomadJointV4.selectLocus($" ac_genomes" as " ac" , $" af_genomes" as " af" , $" an_genomes" as " an" , $" hom_genomes" as " hom" )
175
+ val shapedGnomadExomesV4 = gnomadJointV4.selectLocus($" ac_exomes" as " ac" , $" af_exomes" as " af" , $" an_exomes" as " an" , $" hom_exomes" as " hom" )
176
+ val shapedGnomadJointV4 = gnomadJointV4.selectLocus($" ac_joint" as " ac" , $" af_joint" as " af" , $" an_joint" as " an" , $" hom_joint" as " hom" )
173
177
174
178
df
175
179
.joinAndMerge(shapedThousandGenomes, " thousand_genomes" , " left" )
@@ -178,14 +182,18 @@ object Variants {
178
182
.joinAndMerge(shapedGnomadExomesV2, " gnomad_exomes_2_1_1" , " left" )
179
183
.joinAndMerge(shapedGnomadGenomesV3, " gnomad_genomes_3" , " left" )
180
184
.joinAndMerge(shapedGnomadGenomesV4, " gnomad_genomes_4" , " left" )
185
+ .joinAndMerge(shapedGnomadExomesV4, " gnomad_exomes_4" , " left" )
186
+ .joinAndMerge(shapedGnomadJointV4, " gnomad_joint_4" , " left" )
181
187
.select(df(" *" ),
182
188
struct(
183
189
col(" thousand_genomes" ),
184
190
col(" topmed_bravo" ),
185
191
col(" gnomad_genomes_2_1_1" ),
186
192
col(" gnomad_exomes_2_1_1" ),
187
193
col(" gnomad_genomes_3" ),
188
- col(" gnomad_genomes_4" )) as " external_frequencies" )
194
+ col(" gnomad_genomes_4" ),
195
+ col(" gnomad_exomes_4" ),
196
+ col(" gnomad_joint_4" )) as " external_frequencies" )
189
197
}
190
198
191
199
def withDbSNP (dbsnp : DataFrame ): DataFrame = {
0 commit comments