
Commit 08b8772

Creates a PoC pipeline to estimate the disk usage of vcf_to_bq on Dataflow.

The pipeline uses each VCF file's raw size, together with the raw and encoded sizes of a short snippet from the beginning of the file, to estimate the encoded size of the whole file. The major blocking bug is that when the snippets are read from VCFs in encoded form, lines are read more than once.
1 parent 6420e96 commit 08b8772
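
For intuition, the estimate is a simple proportion: the encoded-to-raw size ratio measured on the snippet is assumed to hold for the rest of the file. A minimal sketch with made-up numbers (every value below is illustrative, not taken from this commit):

# All numbers are hypothetical, for illustration only.
raw_file_size = 5 * 10**9     # a 5 GB VCF on disk
raw_snippet_size = 10000      # bytes of the first N raw variant lines
encoded_snippet_size = 40000  # bytes of the same lines encoded as Variants

# encoded_file / raw_file == encoded_snippet / raw_snippet, solved for the
# single unknown, encoded_file:
estimated_encoded_size = raw_file_size * encoded_snippet_size / raw_snippet_size
print(estimated_encoded_size)  # 20000000000 bytes, i.e. ~20 GB after encoding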

File tree: 6 files changed, +372 −0 lines

gcp_variant_transforms/beam_io/vcf_snippet_io.py

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A source for reading short snippets of variant lines from VCF files."""

from __future__ import absolute_import

import logging

from apache_beam.io import filebasedsource
from apache_beam.io import range_trackers  # pylint: disable=unused-import
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystems import FileSystems
from apache_beam.io.iobase import Read
from apache_beam.transforms import PTransform
from typing import Iterable, Tuple  # pylint: disable=unused-import

from gcp_variant_transforms.beam_io import vcf_parser, vcfio


class _VcfSnippetSource(filebasedsource.FileBasedSource):
  """A source for reading a limited number of variants from a set of VCF files.

  Lines that are malformed are skipped.

  Parses VCF files (version 4) using the PyVCF library.
  """

  DEFAULT_VCF_READ_BUFFER_SIZE = 65536  # 64kB

  def __init__(self,
               file_pattern,
               snippet_size,
               compression_type=CompressionTypes.AUTO,
               validate=True):
    # type: (str, int, str, bool) -> None
    super(_VcfSnippetSource, self).__init__(file_pattern,
                                            compression_type=compression_type,
                                            validate=validate,
                                            splittable=False)
    self._compression_type = compression_type
    self._snippet_size = snippet_size

  def read_records(
      self,
      file_name,  # type: str
      range_tracker  # type: range_trackers.UnsplittableRangeTracker
      ):
    # type: (...) -> Iterable[Tuple[str, str, vcfio.Variant]]
    # Iterator that emits lines encoded as `Variant` objects.
    record_iterator = vcf_parser.PyVcfParser(
        file_name,
        range_tracker,
        self._pattern,
        self._compression_type,
        allow_malformed_records=True,
        representative_header_lines=None,
        buffer_size=self.DEFAULT_VCF_READ_BUFFER_SIZE,
        skip_header_lines=0)

    # Open a distinct channel to read the same lines as raw bytestrings.
    with FileSystems.open(file_name, self._compression_type) as raw_reader:
      line = raw_reader.readline()
      while line and line.startswith('#'):
        # Skip headers; assume the header size is negligible.
        line = raw_reader.readline()

      count = 0
      for encoded_record in record_iterator:
        raw_record = raw_reader.readline()

        if count >= self._snippet_size:
          break
        if not isinstance(encoded_record, vcfio.Variant):
          continue

        count += 1
        yield file_name, raw_record, encoded_record


class ReadVcfSnippet(PTransform):
  """A PTransform for reading a limited number of lines from VCF files.

  Output will be a PTable mapping `file names -> Tuple[(line, Variant)]`
  objects. The list contains the first `snippet_size` lines that are not
  malformed, first as a raw string and then encoded as a `Variant` class.

  Parses VCF files (version 4) using the PyVCF library.
  """

  def __init__(
      self,
      file_pattern,  # type: str
      snippet_size,  # type: int
      compression_type=CompressionTypes.AUTO,  # type: str
      validate=True,  # type: bool
      **kwargs  # type: **str
      ):
    # type: (...) -> None
    """Initializes the :class:`ReadVcfSnippet` transform.

    Args:
      file_pattern: The file path to read from, either as a single file or a
        glob pattern.
      snippet_size: The number of lines that should be read from each file.
      compression_type: Used to handle compressed input files.
        Typical value is :attr:`CompressionTypes.AUTO
        <apache_beam.io.filesystem.CompressionTypes.AUTO>`, in which case the
        underlying file_path's extension is used to detect the compression.
      validate: Flag to verify that the files exist during pipeline creation
        time.
    """
    super(ReadVcfSnippet, self).__init__(**kwargs)
    self._source = _VcfSnippetSource(
        file_pattern, snippet_size, compression_type, validate=validate)

  def expand(self, pvalue):
    return pvalue.pipeline | Read(self._source)
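
For orientation, a minimal sketch of how the new transform might be wired into a pipeline; the module path (inferred from the test file below), file pattern, snippet size, and step name are illustrative assumptions, not part of this commit:

import apache_beam as beam

from gcp_variant_transforms.beam_io.vcf_snippet_io import ReadVcfSnippet

with beam.Pipeline() as p:
  # Each element is a (file_name, raw_line, vcfio.Variant) tuple for one of
  # the first `snippet_size` well-formed lines of each matched file.
  snippets = p | 'ReadVcfSnippets' >> ReadVcfSnippet(
      'gs://my-bucket/vcfs/*.vcf', snippet_size=50)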

gcp_variant_transforms/beam_io/vcf_snippet_io_test.py

Whitespace-only changes.

gcp_variant_transforms/libs/preprocess_reporter.py

Lines changed: 14 additions & 0 deletions
@@ -26,6 +26,8 @@
 TODO(allieychen): Eventually, it also contains the resource estimation.

 Output example (assuming opening in spreadsheet):
+Estimated disk usage by Dataflow: 4846 GB. The total raw file sizes summed up
+to 1231 GB.
 Header Conflicts
 ID Category Conflicts File Paths Proposed Resolution
 NS INFO num=1 type=Float file1 num=1 type=Float
@@ -43,6 +45,7 @@
 File Path Variant Record Error Message
 file 1 rs6 G A 29 PASS NS=3; invalid literal for int() with base 10.
 """
+import math

 from typing import Dict, List, Optional, Union  # pylint: disable=unused-import

@@ -77,6 +80,7 @@ class _HeaderLine(object):
 def generate_report(
     header_definitions,  # type: merge_header_definitions.VcfHeaderDefinitions
     file_path,  # type: str
+    disk_usage_estimate,  # type: FileSizeInfo
     resolved_headers=None,  # type: vcf_header_io.VcfHeader
     inferred_headers=None,  # type: vcf_header_io.VcfHeader
     malformed_records=None  # type: List[vcfio.MalformedVcfRecord]
@@ -97,6 +101,7 @@ def generate_report(
   """
   resolved_headers = resolved_headers or vcf_header_io.VcfHeader()
   with filesystems.FileSystems.create(file_path) as file_to_write:
+    _append_disk_usage_estimate_to_report(file_to_write, disk_usage_estimate)
     _append_conflicting_headers_to_report(file_to_write, header_definitions,
                                           resolved_headers)
     _append_inferred_headers_to_report(file_to_write, inferred_headers)
@@ -272,6 +277,15 @@ def _format_definition(num_value, type_value):
   ]
   return ' '.join(formatted_definition)

+def _append_disk_usage_estimate_to_report(file_to_write, disk_usage_estimate):
+  # type: (file, FileSizeInfo) -> None
+  if disk_usage_estimate is None:
+    return
+  file_to_write.write(
+      'Estimated disk usage by Dataflow: {} GB. The total raw file sizes '
+      'summed up to {} GB.\n'.format(
+          int(math.ceil(disk_usage_estimate.encoded / 1e9)),
+          int(math.ceil(disk_usage_estimate.raw / 1e9))))

 def _append_to_report(file_to_write, error_type, header, contents):
   # type: (file, str, str, List[str]) -> None
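
As a quick sanity check, the helper would render a made-up estimate like this; the stand-in class and all numbers are purely illustrative:

import math

class _FakeFileSizeInfo(object):
  # Hypothetical stand-in for FileSizeInfo; byte counts are made up.
  raw = 1230.2e9
  encoded = 4845.1e9

print('Estimated disk usage by Dataflow: {} GB. The total raw file sizes '
      'summed up to {} GB.'.format(
          int(math.ceil(_FakeFileSizeInfo.encoded / 1e9)),
          int(math.ceil(_FakeFileSizeInfo.raw / 1e9))))
# -> Estimated disk usage by Dataflow: 4846 GB. The total raw file sizes
#    summed up to 1231 GB.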
Lines changed: 179 additions & 0 deletions
@@ -0,0 +1,179 @@
# Copyright 2018 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper functions for estimating the resources a vcf_to_bq run will require.

Currently, the resource estimator only estimates the disk usage that a Dataflow
pipeline will need at the `MergeVariants` step, since underestimating it can
cause expensive pipeline failures late in the run.
"""
import logging

import apache_beam as beam
from apache_beam import coders
from apache_beam.io.filesystem import CompressionTypes
from apache_beam.io.filesystems import FileSystems
from typing import Any, Dict, Iterable, List, Tuple  # pylint: disable=unused-import

from gcp_variant_transforms.beam_io import vcfio


# TODO(hanjohn): Add unit tests.


def _convert_variant_snippets_to_bytesize(variant):
  # type: (vcfio.Variant) -> int
  return coders.registry.get_coder(vcfio.Variant).estimate_size(variant)


class SnippetSizeInfo(object):
  def __init__(self,
               raw_snippet_size,  # type: int
               encoded_snippet_size,  # type: int
              ):
    # type: (...) -> None
    self.raw = raw_snippet_size
    self.encoded = encoded_snippet_size


class FileSizeInfo(object):
  def __init__(self, raw_file_size, encoded_file_size=None):
    # type: (int, int) -> None
    self.raw = raw_file_size
    self.encoded = encoded_file_size

  def calculateEncodedFileSize(self, snippet_size_info):
    # type: (SnippetSizeInfo) -> None
    """Estimates a VCF file's encoded size based on snippet analysis.

    Given the raw file size and measurements of several VCF lines from the
    file, estimate how much disk the file will take after expansion due to
    encoding lines as `vcfio.Variant` objects. The result is stored in
    `self.encoded`.

    This is a simple ratio problem, solving for encoded_file_size, which is
    the only unknown:
    encoded_file_size / raw_file_size = encoded_snippet_size / raw_snippet_size
    """
    if snippet_size_info.raw == 0:
      # Note: FileSizeInfo does not track the file path, so the offending
      # file cannot be named in this message.
      logging.error("A VCF file reported 0 well-formed variant lines; its "
                    "contribution to disk resource usage will be ignored.")
      self.encoded = 0
      self.raw = 0
    else:
      self.encoded = (self.raw * snippet_size_info.encoded /
                      snippet_size_info.raw)


def measure_variant_size(element):
  # type: (Tuple[str, str, vcfio.Variant]) -> Tuple[str, SnippetSizeInfo]
  """Measures the lengths of the raw and encoded representations of a Variant.

  Given a PTable mapping file_paths to the raw (bytestring) and vcfio.Variant-
  encoded representations of a Variant line, have the output PTable instead map
  from the file_paths to a Tuple with the (raw, encoded) representation sizes.

  The file_path keys are not expected to be unique.
  """
  file_path, raw_variant, encoded_variant = element
  encoded_variant_size = _convert_variant_snippets_to_bytesize(encoded_variant)
  raw_variant_size = len(raw_variant)
  return file_path, SnippetSizeInfo(raw_variant_size, encoded_variant_size)


def estimate_file_encoded_size(element):
  # type: (Tuple[str, Dict[str, Any]]) -> Tuple[str, FileSizeInfo]
  file_name, metrics = element
  file_size_info = metrics['whole_file_raw_size'][0]  # type: FileSizeInfo
  snippet_size_info = metrics['snippet_stats'][0]  # type: SnippetSizeInfo

  # Assume that the ratio of encoded size to raw disk size is roughly the same
  # throughout the file as in the first several lines.
  file_size_info.calculateEncodedFileSize(snippet_size_info)

  logging.debug("File %s has raw file size %d, raw snippet size %d, encoded "
                "snippet size %d. Estimated encoded file size: %d",
                file_name, file_size_info.raw, snippet_size_info.raw,
                snippet_size_info.encoded, file_size_info.encoded)
  return file_name, file_size_info


def get_file_sizes(input_pattern):
  # type: (str) -> List[Tuple[str, FileSizeInfo]]
  match_results = FileSystems.match([input_pattern])
  file_sizes = []
  for match in match_results:
    for file_metadata in match.metadata_list:
      compression_type = CompressionTypes.detect_compression_type(
          file_metadata.path)
      if compression_type != CompressionTypes.UNCOMPRESSED:
        logging.error("VCF file %s is compressed; disk requirement estimator "
                      "will not be accurate.", file_metadata.path)

      file_sizes.append((file_metadata.path,
                         FileSizeInfo(file_metadata.size_in_bytes),))
  return file_sizes


class SnippetSizeInfoSumFn(beam.CombineFn):
  """Combiner Function to sum up the size fields of SnippetSizeInfos.

  Example: [SnippetSizeInfo(a, b), SnippetSizeInfo(c, d)] ->
           SnippetSizeInfo(a+c, b+d)
  """
  def create_accumulator(self):
    # type: () -> Tuple[int, int]
    return (0, 0)  # (raw, encoded) sums

  def add_input(self, sums, input):
    # type: (Tuple[int, int], SnippetSizeInfo) -> Tuple[int, int]
    return sums[0] + input.raw, sums[1] + input.encoded

  def merge_accumulators(self, accumulators):
    # type: (Iterable[Tuple[int, int]]) -> Tuple[int, int]
    first, second = zip(*accumulators)
    return sum(first), sum(second)

  def extract_output(self, sums):
    # type: (Tuple[int, int]) -> SnippetSizeInfo
    return SnippetSizeInfo(*sums)


class FileSizeInfoSumFn(beam.CombineFn):
  """Combiner Function to sum up the size fields of Tuple[str, FileSizeInfo]s.

  Unlike SnippetSizeInfoSumFn, the input is a PTable mapping str to
  FileSizeInfo, so the input is a tuple with the FileSizeInfos as the second
  field. The output strips out the str key, which represents the file path.

  Example: [('/path/a', FileSizeInfo(a, b)), ('/path/b', FileSizeInfo(c, d))]
           -> FileSizeInfo(a+c, b+d)
  """
  def create_accumulator(self):
    # type: () -> Tuple[int, int]
    return (0, 0)  # (raw, encoded) sums

  def add_input(self, raw_encoded, input):
    # type: (Tuple[int, int], Tuple[str, FileSizeInfo]) -> Tuple[int, int]
    raw, encoded = raw_encoded
    _, file_size_info = input
    return raw + file_size_info.raw, encoded + file_size_info.encoded

  def merge_accumulators(self, accumulators):
    # type: (Iterable[Tuple[int, int]]) -> Tuple[int, int]
    raw, encoded = zip(*accumulators)
    return sum(raw), sum(encoded)

  def extract_output(self, raw_encoded):
    # type: (Tuple[int, int]) -> FileSizeInfo
    raw, encoded = raw_encoded
    return FileSizeInfo(raw, encoded)
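
A sketch of how these pieces could compose into the estimation pipeline; the wiring, step names, file pattern, and snippet size are assumptions for illustration, not code from this commit (ReadVcfSnippet is the new transform from the beam_io module above):

import apache_beam as beam

with beam.Pipeline() as p:
  # (file_path, FileSizeInfo) pairs from a cheap filesystem scan.
  raw_sizes = p | 'FileSizes' >> beam.Create(get_file_sizes('data/*.vcf'))
  # (file_path, SnippetSizeInfo) pairs summed over the snippet lines.
  snippet_stats = (
      p
      | 'ReadSnippets' >> ReadVcfSnippet('data/*.vcf', snippet_size=50)
      | 'MeasureVariants' >> beam.Map(measure_variant_size)
      | 'SumSnippetSizes' >> beam.CombinePerKey(SnippetSizeInfoSumFn()))
  # Join on file path, extrapolate per file, then sum across files.
  estimate = (
      {'whole_file_raw_size': raw_sizes, 'snippet_stats': snippet_stats}
      | 'JoinByFile' >> beam.CoGroupByKey()
      | 'EstimatePerFile' >> beam.Map(estimate_file_encoded_size)
      | 'SumFileSizes' >> beam.CombineGlobally(FileSizeInfoSumFn()))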

gcp_variant_transforms/options/variant_transform_options.py

Lines changed: 7 additions & 0 deletions
@@ -424,6 +424,13 @@ def add_arguments(self, parser):
         help=('The full path of the resolved headers. The file will not be'
               'generated if unspecified. Otherwise, please provide a local '
               'path if run locally, or a cloud path if run on Dataflow.'))
+    parser.add_argument(
+        '--estimate_disk_usage',
+        type='bool', default=False, nargs='?', const=True,
+        help=('By default, disk resource usage will not be estimated. '
+              'If true, the preprocessor will estimate the maximum disk usage '
+              'consumed at any step in the pipeline; underestimating this can '
+              'lead to out-of-disk errors at a shuffle step, e.g. '
+              'MergeVariants.'))

 class PartitionOptions(VariantTransformsOptions):
   """Options for partitioning Variant records."""
