-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathremove_duplicate_ont_aln.py
76 lines (62 loc) · 3.07 KB
/
remove_duplicate_ont_aln.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import argparse
import pysam
def main():
parser = argparse.ArgumentParser(description='Remove redundant alignment records from ONT BAM file',
prog='remove_redundant_reads')
parser.add_argument('-p', '--prefix', type=str, default="shard", help="Output prefix")
parser.add_argument('-a', '--annotations', type=str, help="Annotations on (potential) duplicate reads")
parser.add_argument('bam', type=str, help="BAM")
args = parser.parse_args()
# create a dict of set's, a trick to avoid Hash collisions
guilty_dict_per_chr = dict()
with open(args.annotations) as f:
for line in f:
arr = line.strip().split('\t')
name = arr[0]
chrom = arr[2]
guilty_dict_per_chr.setdefault(chrom, set())
guilty_dict_per_chr[chrom].add(name)
print("chromosomes on which there are duplicate records:")
print(f" {guilty_dict_per_chr}")
# Silence message about the .bai file not being found.
pysam.set_verbosity(0)
num_alignments, num_dropped_alignments = 0, 0
duplicate_signatures = []
bf = pysam.Samfile(args.bam, 'rb', check_sq=False)
with pysam.Samfile(f'{args.prefix}.bam', 'wb', header=bf.header) as out:
# we rely on the observation that for coordinate sorted BAM,
# duplicate records will appear in blocks, hence once we step off a position with duplicates, we start afresh
current_position = -1
current_signatures = set()
for read in bf:
num_alignments += 1
chrom = read.reference_name
n = read.query_name
if chrom in guilty_dict_per_chr and n in guilty_dict_per_chr[chrom]:
mq = read.mapping_quality
sam_flag = read.flag
pos = read.reference_start
cigar = read.cigarstring
signature = f"{n}-{chrom}-{pos}-{mq}-{sam_flag}-{cigar}"
if current_position != pos: # new position, let's write and reset
out.write(read)
current_position = pos
current_signatures = set()
current_signatures.add(signature)
elif signature in current_signatures: # You're a duplicate record, and not appearing for the 1st time!
num_dropped_alignments += 1
duplicate_signatures.append(signature) # same signature may appear more than twice, hence list and append
pass
else: # you are in a new group of duplicates that map to this location
out.write(read)
current_signatures.add(signature)
else:
out.write(read)
print(f'num_alignments: {num_alignments}')
print(f'num_dropped_alignments: {num_dropped_alignments}')
print(f'num_kept_alignments: {num_alignments - num_dropped_alignments}')
with open(f'{args.prefix}.duplicate.signatures.txt', 'w') as out:
for sig in duplicate_signatures:
out.write(f"{sig}\n")
if __name__ == "__main__":
main()