-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathconvert_ICESat2_format.py
348 lines (326 loc) · 13.2 KB
/
convert_ICESat2_format.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
#!/usr/bin/env python
u"""
convert_ICESat2_format.py
Written by Tyler Sutterley (09/2023)
Converts ICESat-2 HDF5 datafiles to zarr or rechunked HDF5 datafiles
zarr files make large datasets easily accessible to distributed computing on
both local filesystems and cloud-based object stores
- arrays are divided into chunks and compressed
- metadata are stored in lightweight .json files
rechunked HDF5 files can be more optimized for cloud-based object stores
CALLING SEQUENCE:
python convert_ICESat2_format.py --release=003 ATL06
INPUTS:
ATL03: Global Geolocated Photon Data
ATL04: Normalized Relative Backscatter
ATL06: Land Ice Height
ATL07: Sea Ice Height
ATL08: Land and Vegetation Height
ATL09: Atmospheric Layer Characteristics
ATL10: Sea Ice Freeboard
ATL12: Ocean Surface Height
ATL13: Inland Water Surface Height
COMMAND LINE OPTIONS:
--help: list the command line options
-D X, --directory X: working data directory
-Y X, --year X: one or more years to run (separated by spaces)
-S X, --subdirectory X: one or more subdirectories to run (separated by spaces)
-r X, --release X: ICESat-2 data release to run
-v X, --version X: ICESat-2 data version to run
-t X, --track X: ICESat-2 reference ground tracks to run
-c X, --cycle X: ICESat-2 cycle to run
-g X, --granule X: ICESat-2 granule regions to run
--format X: output file format (zarr, HDF5)
--chunks X: Rechunk output files to size
-P X, --np X: Number of processes to use in file conversion
-C, --clobber: Overwrite existing files
-V, --verbose: Verbose output of processing run
-M X, --mode X: Local permissions mode of the converted files
PYTHON DEPENDENCIES:
numpy: Scientific Computing Tools For Python
https://numpy.org
https://numpy.org/doc/stable/user/numpy-for-matlab-users.html
h5py: Python interface for Hierarchical Data Format 5 (HDF5)
https://h5py.org
http://docs.h5py.org/en/stable/index.html
zarr: Chunked, compressed, N-dimensional arrays in Python
https://github.com/zarr-developers/zarr-python
https://zarr.readthedocs.io/en/stable/index.html
pandas: Python Data Analysis Library
https://pandas.pydata.org/
UPDATE HISTORY:
Updated 09/2023: generalized regular expressions for non-entered cases
Updated 12/2022: single implicit import of altimetry tools
Updated 06/2022: use explicit import of convert functions
Updated 05/2022: use argparse descriptions within sphinx documentation
Updated 10/2021: using python logging for handling verbose output
added parsing for converting file lines to arguments
Updated 07/2021: set context for multiprocessing to fork child processes
Updated 01/2021: generalized to output either zarr or rechunked HDF5
Updated 10/2020: using argparse to set parameters. added verbose keyword
added chunks keyword to rechunk output zarr files
using convert module to convert from HDF5 to zarr
Written 06/2020
"""
from __future__ import print_function
import sys
import os
import re
import logging
import pathlib
import argparse
import traceback
import multiprocessing as mp
import icesat2_toolkit as is2tk
# PURPOSE: convert the ICESat-2 elevation data from HDF5 to zarr
# or rechunked HDF5 formats
def convert_ICESat2_format(DIRECTORY, PRODUCTS, RELEASE, VERSIONS, GRANULES,
TRACKS, YEARS=None, SUBDIRECTORY=None, CYCLES=None, FORMAT=None, CHUNKS=None,
PROCESSES=0, CLOBBER=False, VERBOSE=False, MODE=0o775):
# create logger
loglevel = logging.INFO if VERBOSE else logging.CRITICAL
logging.basicConfig(level=loglevel)
# regular expression operator for finding files of a particular granule
# find ICESat-2 HDF5 files in the subdirectory for product and release
if TRACKS:
regex_track = r'|'.join([rf'{T:04d}' for T in TRACKS])
else:
regex_track = r'\d{4}'
if CYCLES:
regex_cycle = r'|'.join([rf'{C:02d}' for C in CYCLES])
else:
regex_cycle = r'\d{2}'
if GRANULES:
regex_granule = r'|'.join([rf'{G:02d}' for G in GRANULES])
else:
regex_granule = r'\d{2}'
if VERSIONS:
regex_version = r'|'.join([rf'{V:02d}' for V in VERSIONS])
else:
regex_version = r'\d{2}'
regex_pattern = (r'(processed_)?({0})(-\d{{2}})?_(\d{{4}})(\d{{2}})(\d{{2}})'
r'(\d{{2}})(\d{{2}})(\d{{2}})_({1})({2})({3})_({4})_({5})(.*?).h5$')
# regular expression operator for finding subdirectories
if SUBDIRECTORY:
# convert particular subdirectories for product
R2 = re.compile(r'('+r'|'.join(SUBDIRECTORY)+r')', re.VERBOSE)
elif YEARS:
# convert particular years for product
regex_years = '|'.join(rf'{y:d}' for y in YEARS)
R2 = re.compile(rf'({regex_years}).(\d+).(\d+)', re.VERBOSE)
else:
# convert all available subdirectories for product
R2 = re.compile(r'(\d+).(\d+).(\d+)', re.VERBOSE)
# build list of HDF5 files
hdf5_file_list = []
# for each ICESat-2 product listed
for p in PRODUCTS:
logging.info(f'PRODUCT={p}')
# local file directory
ddir = DIRECTORY.joinpath(f'{p}.{RELEASE}')
subdirectories = [sd for sd in ddir.iterdir() if R2.match(sd.name)]
# compile regular expression operator for product, release and version
args = (p,regex_track,regex_cycle,regex_granule,RELEASE,regex_version)
R1 = re.compile(regex_pattern.format(*args), re.VERBOSE)
# for each subdirectory
for sd in subdirectories:
# find matching files (for granule, release, version, track)
# add to list of HDF5 files
hdf5_file_list.extend([f for f in sd.iterdir() if R1.match(f.name)])
# convert in series if PROCESSES = 0
if (PROCESSES == 0):
# convert each ICESat-2 data file
for f in hdf5_file_list:
# convert ICESat-2 file to output format
output = convert_HDF5(f,
FORMAT=FORMAT,
CHUNKS=CHUNKS,
CLOBBER=CLOBBER,
MODE=MODE
)
# print the output string
logging.info(output)
else:
# set multiprocessing start method
ctx = mp.get_context("fork")
# convert in parallel with multiprocessing Pool
pool = ctx.Pool(processes=PROCESSES)
# convert each ICESat-2 data file
output = []
for hdf5_file in hdf5_file_list:
# convert ICESat-2 file to output format
kwds = dict(FORMAT=FORMAT,
CHUNKS=CHUNKS,
CLOBBER=CLOBBER,
MODE=MODE
)
output.append(pool.apply_async(multiprocess_convert,
args=(hdf5_file,), kwds=kwds))
# start multiprocessing jobs
# close the pool
# prevents more tasks from being submitted to the pool
pool.close()
# exit the completed processes
pool.join()
# print the output string
for out in output:
logging.info(out.get())
# PURPOSE: wrapper for running conversion program in multiprocessing mode
def multiprocess_convert(hdf5_file, FORMAT=None, CHUNKS=None, CLOBBER=False,
    MODE=0o775):
    """
    Run ``convert_HDF5`` for a single file within a worker process

    Any exception raised by the conversion is logged together with the
    id of the failing process instead of being propagated

    Parameters
    ----------
    hdf5_file: pathlib.Path
        input file passed to ``convert_HDF5``
    FORMAT: str or NoneType, default None
        output file format
    CHUNKS: int or NoneType, default None
        chunk size of the output file
    CLOBBER: bool, default False
        overwrite existing output files
    MODE: int, default 0o775
        permissions mode of the converted file

    Returns
    -------
    output: str or NoneType
        value returned by ``convert_HDF5``, or None if the conversion failed
    """
    # conversion options forwarded unchanged
    options = dict(FORMAT=FORMAT, CHUNKS=CHUNKS, CLOBBER=CLOBBER, MODE=MODE)
    try:
        return convert_HDF5(hdf5_file, **options)
    except Exception:
        # report which process failed along with the type, value
        # and stack trace of the exception being handled
        logging.critical(f'process id {os.getpid():d} failed')
        logging.error(traceback.format_exc())
    # a failed conversion has no output string
    return None
# PURPOSE: convert the HDF5 file and change permissions
def convert_HDF5(input_file,
        FORMAT=None,
        CHUNKS=None,
        CLOBBER=False,
        MODE=0o775
    ):
    """
    Convert a single HDF5 file to the output format and set the
    modification time and permissions of the converted file

    The conversion is only run if the output file does not exist,
    is older than the input file, or if ``CLOBBER`` is set

    Parameters
    ----------
    input_file: pathlib.Path
        input HDF5 file
    FORMAT: str or NoneType, default None
        output file format: ``'zarr'`` or ``'HDF5'``
    CHUNKS: int or NoneType, default None
        chunk size passed to the file converter
    CLOBBER: bool, default False
        convert the file even if the output is up to date
    MODE: int, default 0o775
        permissions mode of the converted file

    Returns
    -------
    output: str or NoneType
        string describing the conversion, or None if the file was skipped

    Raises
    ------
    ValueError
        if ``FORMAT`` is not a supported output format
    """
    # output file for the requested format
    if (FORMAT == 'zarr'):
        output_file = input_file.with_suffix('.zarr')
    elif (FORMAT == 'HDF5'):
        # NOTE: for an input already ending in .h5 this is the input path
        # itself, so such a file is only converted if CLOBBER is set
        output_file = input_file.with_suffix('.h5')
    else:
        # fail with a clear message rather than an unbound local variable
        raise ValueError(f'Unsupported output format: {FORMAT!r}')
    # if output file exists in file system: check if HDF5 file is newer
    TEST = False
    OVERWRITE = ' (clobber)'
    # last modification time of HDF5 file
    input_mtime = input_file.stat().st_mtime
    # check if local version of file exists
    if output_file.exists():
        # check last modification time of output file
        output_mtime = output_file.stat().st_mtime
        # if HDF5 file is newer: overwrite the output file
        if (input_mtime > output_mtime):
            TEST = True
            OVERWRITE = ' (overwrite)'
    else:
        TEST = True
        OVERWRITE = ' (new)'
    # skip files that are up to date unless CLOBBER is set
    if not (TEST or CLOBBER):
        return None
    # output string for printing files transferred
    args = (str(input_file), str(output_file), OVERWRITE)
    output = '{0} -->\n\t{1}{2}\n'.format(*args)
    # copy everything from the HDF5 file to the output file
    # assumes the converter writes to the same path as output_file
    # TODO confirm against icesat2_toolkit.convert
    conv = is2tk.convert(filename=input_file, reformat=FORMAT)
    conv.file_converter(chunks=CHUNKS)
    # keep remote modification time of file and local access time
    os.utime(output_file, (output_file.stat().st_atime, input_mtime))
    output_file.chmod(mode=MODE)
    # return the output string
    return output
# PURPOSE: create argument parser
def arguments():
    """
    Build the command line argument parser for the conversion program

    Arguments can also be read from files prefixed with ``@``, with each
    line split using ``is2tk.utilities.convert_arg_line_to_args``

    Returns
    -------
    parser: argparse.ArgumentParser
        parser with all command line options of the program
    """
    parser = argparse.ArgumentParser(
        description=(
            "Converts ICESat-2 HDF5 datafiles to zarr or\n"
            "rechunked HDF5 datafiles\n"
        ),
        fromfile_prefix_chars="@",
    )
    parser.convert_arg_line_to_args = is2tk.utilities.convert_arg_line_to_args
    # shorthand for registering options
    add = parser.add_argument

    # ICESat-2 products that can be converted and their long names
    product_names = {
        'ATL03': 'Global Geolocated Photon Data',
        'ATL04': 'Normalized Relative Backscatter',
        'ATL06': 'Land Ice Height',
        'ATL07': 'Sea Ice Height',
        'ATL08': 'Land and Vegetation Height',
        'ATL09': 'Atmospheric Layer Characteristics',
        'ATL10': 'Sea Ice Freeboard',
        'ATL12': 'Ocean Surface Height',
        'ATL13': 'Inland Water Surface Height',
    }
    # valid ranges of granule regions and reference ground tracks
    granule_regions = range(1,15)
    ground_tracks = range(1,1388)

    # positional: products to convert
    add('products',
        metavar='PRODUCTS',
        type=str,
        nargs='+',
        choices=product_names.keys(),
        help='ICESat-2 products to convert')
    # where the data are stored
    add('--directory', '-D',
        type=pathlib.Path,
        default=pathlib.Path.cwd(),
        help='Working data directory')
    # selection by year
    add('--year', '-Y',
        type=int,
        nargs='+',
        help='Years to run')
    # selection by subdirectory
    add('--subdirectory', '-S',
        type=str,
        nargs='+',
        help='subdirectories of data to run')
    # data release
    add('--release', '-r',
        type=str,
        default='006',
        help='ICESat-2 Data Release')
    # data version
    add('--version', '-v',
        type=int,
        nargs='+',
        help='ICESat-2 Data Version')
    # granule region
    add('--granule', '-g',
        metavar='REGION',
        type=int,
        nargs='+',
        choices=granule_regions,
        default=granule_regions,
        help='ICESat-2 Granule Region')
    # orbital cycle
    add('--cycle', '-c',
        type=int,
        nargs='+',
        default=None,
        help='ICESat-2 orbital cycles to convert')
    # reference ground tracks
    add('--track', '-t',
        metavar='RGT',
        type=int,
        nargs='+',
        choices=ground_tracks,
        default=ground_tracks,
        help='ICESat-2 Reference Ground Tracks (RGTs)')
    # format of the converted files
    add('--format',
        type=str,
        choices=('zarr','HDF5'),
        default='zarr',
        help='Output file format')
    # chunk size of the converted files
    add('--chunks',
        type=int,
        help='Rechunk output files to size')
    # conversion runs in series if the number of processes is 0
    add('--np', '-P',
        metavar='PROCESSES',
        type=int,
        default=0,
        help='Number of processes to use in file conversion')
    # overwrite existing converted files
    add('--clobber', '-C',
        default=False,
        action='store_true',
        help='Overwrite existing data')
    # report each converted file
    add('--verbose', '-V',
        default=False,
        action='store_true',
        help='Verbose output of run')
    # permissions of the converted files (parsed as an octal number)
    add('--mode', '-M',
        type=lambda x: int(x,base=8),
        default=0o775,
        help='Permissions mode of output files')
    return parser
# This is the main part of the program that calls the individual functions
def main():
    """
    Parse the command line and run the conversion for the requested products

    Command line arguments that are not recognized by the parser are ignored
    """
    # read the system arguments listed after the program
    options, _unrecognized = arguments().parse_known_args()
    # optional settings of the conversion
    settings = dict(
        YEARS=options.year,
        SUBDIRECTORY=options.subdirectory,
        CYCLES=options.cycle,
        FORMAT=options.format,
        CHUNKS=options.chunks,
        PROCESSES=options.np,
        CLOBBER=options.clobber,
        VERBOSE=options.verbose,
        MODE=options.mode,
    )
    # convert HDF5 files for each data product
    convert_ICESat2_format(
        options.directory,
        options.products,
        options.release,
        options.version,
        options.granule,
        options.track,
        **settings
    )
# run main program only when the file is executed as a script
if __name__ == '__main__':
    main()