#!/usr/bin/env python
# prepare_ptbxl_data.py (forked from physionetchallenges/python-example-2025)
# Load libraries.
import argparse
import numpy as np
import os
import os.path
import pandas as pd
import shutil
import sys
import wfdb
from helper_code import find_records, get_signal_files, is_integer
# Parse arguments.
def get_parser():
    """Build the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` that accepts the input folder (-i), the
        PTB-XL database file (-d), the signal format (-f, optional, 'dat' by
        default) and the output folder (-o).
    """
    description = 'Prepare the PTB-XL database for use in the Challenge.'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        '-i', '--input_folder', type=str, required=True,
        help='folder with the PTB-XL records (records100 or records500)')
    parser.add_argument(
        '-d', '--ptbxl_database_file', type=str, required=True,
        help='PTB-XL demographics file (ptbxl_database.csv)')
    parser.add_argument(
        '-f', '--signal_format', type=str, required=False, default='dat',
        choices=['dat', 'mat'],
        help='format of the signal files in the output folder')
    parser.add_argument(
        '-o', '--output_folder', type=str, required=True,
        help='folder that receives the updated headers and signal files')
    return parser
# Suppress stdout for noisy commands.
from contextlib import contextmanager
@contextmanager
def suppress_stdout():
    """Context manager that discards everything written to ``sys.stdout``.

    While the ``with`` body runs, ``sys.stdout`` points at ``os.devnull``; the
    previous ``sys.stdout`` object is put back afterwards, including when the
    body raises.
    """
    with open(os.devnull, 'w') as devnull:
        saved_stdout = sys.stdout
        sys.stdout = devnull
        try:
            yield
        finally:
            # Always restore, so an exception in the body cannot leave stdout muted.
            sys.stdout = saved_stdout
# Convert .dat files to .mat files (optional).
def convert_dat_to_mat(record, write_dir=None):
    """Convert a WFDB record from a .dat signal file to a .mat signal file.

    The record is converted with ``wfdb.io.convert.matlab.wfdb_to_mat``, the
    original ``.hea``/``.dat`` files are deleted, and the generated
    ``<name>m.hea``/``<name>m.mat`` files are renamed to ``<name>.hea``/
    ``<name>.mat``. The header is then rewritten so that it refers to the
    renamed record, and its '#Creator' and '#Source' comment lines are dropped.

    Args:
        record: Record path without a file extension. It may contain a
            directory part; when ``write_dir`` is given the path is taken
            relative to ``write_dir``.
        write_dir: Optional folder that contains the record. If omitted, the
            record path is resolved against the current working directory.

    Raises:
        OSError: If the working directory cannot be changed or one of the
            expected files is missing when it is removed, renamed or opened.
    """
    import wfdb.io.convert

    # wfdb.io.convert.matlab.wfdb_to_mat places its output files in the current
    # working directory, so work from the folder that holds the record and
    # refer to the record by its base name. This also keeps the remove/rename
    # calls below valid for records that live in a subfolder.
    # NOTE(review): assumes the output files are named after the record's base
    # name plus 'm' -- confirm against the installed wfdb version.
    record_dir, record_name = os.path.split(record)
    work_dir = os.path.join(write_dir, record_dir) if write_dir else record_dir

    cwd = os.getcwd()
    if work_dir:
        os.chdir(work_dir)
    try:
        # Convert the .dat file to a .mat file.
        with suppress_stdout():
            wfdb.io.convert.matlab.wfdb_to_mat(record_name)

        # Remove the original header and .dat file.
        os.remove(record_name + '.hea')
        os.remove(record_name + '.dat')

        # Rename the generated files back to the original record name.
        converted_name = record_name + 'm'
        header_file = record_name + '.hea'
        os.rename(converted_name + '.hea', header_file)
        os.rename(converted_name + '.mat', record_name + '.mat')

        # Update the header file with the renamed record and .mat file.
        with open(header_file, 'r') as f:
            header_lines = [
                line.replace(converted_name, record_name)
                for line in f
                if not line.startswith(('#Creator', '#Source'))
            ]
        with open(header_file, 'w') as f:
            f.write(''.join(header_lines))
    finally:
        # Change back to the previous working directory even if a step failed.
        if work_dir:
            os.chdir(cwd)
# Fix the checksums from the Python WFDB library.
def fix_checksums(record, checksums=None):
    """Rewrite the checksum field of the signal lines in a WFDB header file.

    The header ``record + '.hea'`` is read, the seventh space-separated field
    of each signal line is replaced by the matching entry of ``checksums``,
    and the file is written back. All other lines are kept as they are.

    Args:
        record: Record path without the '.hea' extension.
        checksums: Optional sequence with one checksum per signal, in header
            order. If omitted, the checksums are computed from the record's
            digital samples, read with ``wfdb.rdrecord``.

    Raises:
        OSError: If the header file cannot be read or written.
        IndexError: If ``checksums`` has fewer entries than there are signal
            lines to update.
    """
    if checksums is None:
        x = wfdb.rdrecord(record, physical=False)
        signals = np.asarray(x.d_signal)
        # Accumulating in int16 makes each per-signal sum wrap around at 16 bits.
        # NOTE(review): presumably this matches the 16-bit checksum that WFDB
        # headers store -- confirm against the header format specification.
        checksums = np.sum(signals, axis=0, dtype=np.int16)

    header_filename = record + '.hea'

    output_lines = []
    num_leads = 0
    with open(header_filename, 'r') as f:
        for i, line in enumerate(f):
            if i == 0:
                # The record line gives the number of signal lines that follow.
                num_leads = int(line.split(' ')[1])
            elif i <= num_leads and not line.startswith('#'):
                # Split off the line ending first: when the checksum is the last
                # field on the line, replacing it must not swallow the newline.
                content = line.rstrip('\r\n')
                ending = line[len(content):]
                fields = content.split(' ')
                # Lines that stop before the checksum field have nothing to fix.
                if len(fields) > 6:
                    fields[6] = str(checksums[i - 1])
                    line = ' '.join(fields) + ending
            output_lines.append(line)

    with open(header_filename, 'w') as f:
        f.write(''.join(output_lines))
# Run script.
def run(args):
    """Prepare every PTB-XL record found under the input folder.

    For each record, the row of the PTB-XL database file with the record's
    ``ecg_id`` supplies the recording date and time, age, sex, height and
    weight. The record's header is rewritten into the output folder with the
    date and time on the record line and with '# Age:', '# Sex:', '# Height:',
    '# Weight:' and '# Chagas label:' comment lines. The signal files are
    copied when the output folder differs from the input folder, optionally
    converted to .mat files, and the header checksums are recomputed.

    Args:
        args: Parsed arguments with the attributes ``input_folder``,
            ``ptbxl_database_file``, ``signal_format`` and ``output_folder``.

    Raises:
        KeyError: If a record's ``ecg_id`` is not in the database file.
        OSError: If a header or signal file cannot be read or written.
    """
    # Load the demographic information.
    df = pd.read_csv(args.ptbxl_database_file, index_col='ecg_id')

    # Identify the header files.
    records = find_records(args.input_folder)

    # The folders are the same for every record, so compare them only once.
    copy_signal_files = os.path.normpath(args.input_folder) != os.path.normpath(args.output_folder)

    # Update the header files to include demographics data and copy the signal files unchanged.
    for record in records:
        # Extract the demographics data; the ecg_id is the number before the
        # first underscore of the record's file name.
        record_path, record_basename = os.path.split(record)
        ecg_id = int(record_basename.split('_')[0])
        row = df.loc[ecg_id]

        # Reorder the date from yyyy-mm-dd to dd/mm/yyyy for the record line.
        recording_date_string = row['recording_date']
        date_string, time_string = recording_date_string.split(' ')
        yyyy, mm, dd = date_string.split('-')
        date_string = f'{dd}/{mm}/{yyyy}'

        age = row['age']
        age = int(age) if is_integer(age) else float(age)

        sex = row['sex']
        if sex == 0:
            sex = 'Male'
        elif sex == 1:
            sex = 'Female'
        else:
            sex = 'Unknown'

        height = row['height']
        height = int(height) if is_integer(height) else float(height)

        weight = row['weight']
        weight = int(weight) if is_integer(weight) else float(weight)

        # Assume that all of the patients are negative for Chagas, which is likely to be the case for every or almost every patient
        # in the PTB-XL dataset.
        label = False

        # Update the header file.
        input_header_file = os.path.join(args.input_folder, record + '.hea')
        output_header_file = os.path.join(args.output_folder, record + '.hea')
        output_path = os.path.join(args.output_folder, record_path)
        os.makedirs(output_path, exist_ok=True)

        with open(input_header_file, 'r') as f:
            input_header = f.read()

        output_header = _build_output_header(
            input_header, time_string, date_string, age, sex, height, weight, label)

        with open(output_header_file, 'w') as f:
            f.write(output_header)

        # Copy the signal files if the input and output folders are different.
        if copy_signal_files:
            signal_files = get_signal_files(input_header_file)
            for signal_file in signal_files:
                input_signal_file = os.path.join(args.input_folder, record_path, signal_file)
                output_signal_file = os.path.join(args.output_folder, record_path, signal_file)
                if os.path.isfile(input_signal_file):
                    shutil.copy2(input_signal_file, output_signal_file)

        # Convert data from .dat files to .mat files as requested.
        if args.signal_format in ('mat', '.mat'):
            convert_dat_to_mat(record, write_dir=args.output_folder)

        # Recompute the checksums as needed.
        fix_checksums(os.path.join(args.output_folder, record))


def _build_output_header(input_header, time_string, date_string, age, sex, height, weight, label):
    """Return the header text with date/time, demographics and label added."""
    lines = input_header.split('\n')

    # Keep the first four fields of the record line and append the time and
    # date (the trailing space is part of the existing output format).
    record_line = ' '.join(lines[0].strip().split(' ')[:4]).strip() + f' {time_string} {date_string} ' + '\n'

    signal_lines = '\n'.join(
        line.strip() for line in lines[1:] if line.strip() and not line.startswith('#')
    ).strip() + '\n'

    # Keep the existing comments except the ones that are regenerated below.
    replaced_prefixes = ('# Age:', '# Sex:', '# Height:', '# Weight:', '# Chagas label:')
    kept_comments = [
        line.strip() for line in lines[1:]
        if line.startswith('#') and not line.startswith(replaced_prefixes)
    ]

    # Every retained comment gets its own line, so the first demographics line
    # is never appended to the end of the last retained comment.
    comment_lines = ''.join(comment + '\n' for comment in kept_comments)
    comment_lines += f'# Age: {age}\n# Sex: {sex}\n# Height: {height}\n# Weight: {weight}\n# Chagas label: {label}\n'

    return record_line + signal_lines + comment_lines
if __name__ == '__main__':
    # Parse the command-line arguments (without the program name) and run the script.
    run(get_parser().parse_args(sys.argv[1:]))