-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataloader.py
93 lines (74 loc) · 2.58 KB
/
dataloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import csv
import pandas as pd
import glob
import os
def parse_a3m(file_path):
    """
    Parses an A3M file and extracts sequences.

    Lines starting with '>' open a new record; every following line up to the
    next '>' line is stripped of surrounding whitespace and appended to that
    record's sequence. Lines that appear before the first '>' line are ignored.

    Parameters
    ----------
    file_path : str
        Path to the A3M file.

    Returns
    -------
    dict
        Dictionary where keys are sequence headers (the text after '>') and
        values are the corresponding sequences from the file, with multi-line
        sequences joined into a single string. If a header occurs more than
        once, only the sequence following its last occurrence is kept.
    """
    sequence_parts = {}
    with open(file_path, "r") as file:
        header = None  # None means "no header line seen yet"
        for raw_line in file:
            line = raw_line.strip()  # Remove leading and trailing white space
            if line.startswith(">"):  # Header detection
                header = line[1:]  # Remove '>'
                sequence_parts[header] = []  # A repeated header restarts its record
            elif header is not None:
                # Compare against None rather than truthiness so that a record
                # with an empty header (a bare '>' line) keeps its sequence.
                sequence_parts[header].append(line)
    # Join once per record instead of growing a string line by line.
    return {name: "".join(parts) for name, parts in sequence_parts.items()}
def save_to_csv(sequences, output_path):
    """
    Saves sequences to a csv file.

    Two input layouts are supported:

    * flat, ``{header: sequence}``: written with the columns
      ``Header, Sequence``, one row per entry;
    * grouped, ``{group: {header: sequence}}`` (every value is a dict):
      written with the columns ``Protein, Header, Sequence``, one row per
      sequence, so that no cell contains the string form of a whole dict.

    An empty mapping, or one whose values are not all dicts, is written in
    the flat layout.

    Parameters
    ----------
    sequences: dict
        Protein sequences with headers as keys and sequences as values, or a
        dict of such dicts keyed by a group (protein) name.
    output_path : str
        Path for the desired output file. An existing file is overwritten.

    Returns
    -------
    None
    """
    # Decide the layout up front because it determines the header row.
    grouped = bool(sequences) and all(
        isinstance(value, dict) for value in sequences.values()
    )
    # newline="" lets the csv module control line endings itself.
    with open(output_path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        if grouped:
            writer.writerow(["Protein", "Header", "Sequence"])
            for group_name, entries in sequences.items():
                for header, sequence in entries.items():
                    writer.writerow([group_name, header, sequence])
        else:
            writer.writerow(["Header", "Sequence"])
            for header, sequence in sequences.items():
                writer.writerow([header, sequence])
def load_a3m_files(root_dir, output_csv):
    """
    Load A3M files from all proteins from a directory and save the sequences to a CSV file.

    Every immediate subdirectory of ``root_dir`` is treated as one protein,
    named after the subdirectory. All ``*.a3m`` files directly inside it are
    parsed with ``parse_a3m`` and merged into one dict for that protein. The
    resulting ``{protein_name: {header: sequence}}`` mapping is handed to
    ``save_to_csv`` as-is.

    Parameters
    ----------
    root_dir : str
        Root directory where protein subdirectories and A3M files are located.
        A leading '~' is expanded to the user's home directory.
    output_csv : str
        Path to save the CSV file with sequences.

    Returns
    -------
    None

    Notes
    -----
    Within one protein, a header that occurs in several A3M files is kept only
    once; the file that comes last in sorted path order wins.
    """
    # glob performs no tilde expansion, so a path like "~/data" would
    # otherwise match nothing and produce an output without any sequences.
    root_dir = os.path.expanduser(root_dir)

    all_protein_sequences = {}
    # Gets to the different protein folders. Sorted because glob returns
    # matches in arbitrary order, which would make the output order (and the
    # winner among duplicate headers) vary between runs.
    for protein_dir in sorted(glob.glob(os.path.join(root_dir, '*'))):
        if not os.path.isdir(protein_dir):  # Only subdirectories are proteins
            continue
        protein_name = os.path.basename(protein_dir)  # Directory name labels the sequences
        protein_sequences = {}
        # Find all A3M files in the subdirectory
        for file_path in sorted(glob.glob(os.path.join(protein_dir, '*.a3m'))):
            protein_sequences.update(parse_a3m(file_path))  # Add sequences to the protein's group
        all_protein_sequences[protein_name] = protein_sequences  # Store sequences grouped by protein
    # Save to CSV file
    save_to_csv(all_protein_sequences, output_csv)
# NOTE: these statements sit at module level, so they run whenever this file
# is executed or imported.
# load_a3m_files locates its inputs with glob, which does not expand '~';
# expand it here so the directory under the user's home is actually searched.
load_a3m_files(os.path.expanduser("~/openfold"), "all_sequences")
# Read the freshly written CSV back with pandas; the result is not bound to a name.
pd.read_csv("all_sequences")