Skip to content

Commit e73e2d4

Browse files
rizaonImpala Public Jenkins
authored and
Impala Public Jenkins
committed
IMPALA-13864: Implement ImpylaHS2ResultSet.exec_summary
This patch implements building the exec summary table for ImpylaHS2Connection. It adds a fetch_exec_summary argument to ImpalaConnection.execute(). If this argument is True, an exec summary table will be added to the returned result object. fetch_exec_summary is also implemented for BeeswaxConnection. Thus, BeeswaxConnection will no longer always fetch the exec summary by default. Tests that validate the exec summary table are updated to set fetch_exec_summary=True and migrated to test against the hs2 protocol. Change TestExecutorGroup._set_query_options() to set query options through hs2_client iconfig instead of a SET query. Some flake8 issues are addressed as well. Move build_exec_summary_table to a separate exec_summary.py file. Tweak it a bit to return early if the given TExecSummary is empty. Fixed a bug in ImpalaBeeswaxClient.fetch_results() where the fetch would not happen at all if the discard_result argument is True. Testing: - Run and pass affected tests locally. Change-Id: I7d88f78e58eeda29ce21e7828884c7a129d7efe6 Reviewed-on: http://gerrit.cloudera.org:8080/22626 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]>
1 parent 6f94971 commit e73e2d4

File tree

10 files changed

+344
-244
lines changed

10 files changed

+344
-244
lines changed

shell/exec_summary.py

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
#
4+
# Licensed to the Apache Software Foundation (ASF) under one
5+
# or more contributor license agreements. See the NOTICE file
6+
# distributed with this work for additional information
7+
# regarding copyright ownership. The ASF licenses this file
8+
# to you under the Apache License, Version 2.0 (the
9+
# "License"); you may not use this file except in compliance
10+
# with the License. You may obtain a copy of the License at
11+
#
12+
# http://www.apache.org/licenses/LICENSE-2.0
13+
#
14+
# Unless required by applicable law or agreed to in writing,
15+
# software distributed under the License is distributed on an
16+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17+
# KIND, either express or implied. See the License for the
18+
# specific language governing permissions and limitations
19+
# under the License.
20+
21+
from ExecStats.ttypes import TExecStats
22+
23+
24+
# Per-instance counters that are summed and maximised over a node's exec_stats.
_EXEC_STAT_ATTRS = ("latency_ns", "cpu_time_ns", "cardinality", "memory_used")


def _prettyprint(val, units, divisor):
  """Scale 'val' down by 'divisor' until it fits one of 'units', then format it.

  The first unit is printed as an integer ("%d"); every larger unit is printed with two
  decimals. A value too large even for the last unit is still printed in the last unit,
  so the result is always a string (never None) when 'units' is non-empty.
  """
  last = len(units) - 1
  for pos, unit in enumerate(units):
    if val < divisor or pos == last:
      if pos == 0:
        return "%d%s" % (val, unit)
      return "%3.2f%s" % (val, unit)
    val /= divisor


def _prettyprint_bytes(byte_val):
  """Format a byte count using B/KB/MB/GB/TB with a divisor of 1024."""
  return _prettyprint(byte_val, [' B', ' KB', ' MB', ' GB', ' TB'], 1024.0)


def _prettyprint_units(unit_val):
  """Format a plain count using ""/K/M/B suffixes with a divisor of 1000."""
  return _prettyprint(unit_val, ["", "K", "M", "B"], 1000.0)


def _prettyprint_time(time_val):
  """Format a duration given in nanoseconds using ns/us/ms/s."""
  return _prettyprint(time_val, ["ns", "us", "ms", "s"], 1000.0)


def build_exec_summary_table(summary, idx, indent_level, new_indent_level, output,
                             is_prettyprint=True, separate_prefix_column=False):
  """Direct translation of Coordinator::PrintExecSummary() to recursively build a list
  of rows of summary statistics, one per exec node.

  summary: the TExecSummary object that contains all the summary data. Only its 'nodes'
  and 'exch_to_sender_map' attributes are read here.

  idx: the index in summary.nodes of the node to print.

  indent_level: the depth used to build the node's label prefix, to give the appearance
  of a tree. The 0th child of a node has the same indent_level as its parent. All other
  children have an indent_level of one greater than their parent.

  new_indent_level: If true, this indent level is different from the previous row's.

  output: the list of rows into which to append the rows produced for this node and its
  children. It is modified in place.

  is_prettyprint: Optional. If True, print time, units, and bytes columns in pretty
  printed format. Otherwise the raw numbers are put in the row.

  separate_prefix_column: Optional. If True, the prefix and operator name will be
  returned as separate columns. Otherwise, prefix and operator name will be concatenated
  into a single column.

  Each appended row is: [prefix + label (or prefix, label), num_hosts, instances,
  avg_time, max latency, cardinality, estimated cardinality, max memory used, estimated
  memory, label_detail].

  Returns the index of the next exec node in summary.nodes that should be processed
  (used internally by the recursion), or None if summary.nodes is empty or None.
  """
  if not summary.nodes:
    # Summary nodes is empty or None. Nothing to build.
    return
  assert idx < len(summary.nodes), (
    "Index ({0}) must be less than exec summary count ({1})").format(
      idx, len(summary.nodes))

  node = summary.nodes[idx]

  # Aggregate (sum) and maximum of every counter across the node's instances.
  agg_stats = dict.fromkeys(_EXEC_STAT_ATTRS, 0)
  max_stats = dict.fromkeys(_EXEC_STAT_ATTRS, 0)
  instances = 0
  avg_time = 0
  if node.exec_stats:
    # exec_stats is not None or an empty list.
    instances = len(node.exec_stats)
    for stats in node.exec_stats:
      for attr in _EXEC_STAT_ATTRS:
        val = getattr(stats, attr)
        if val is not None:
          agg_stats[attr] += val
          max_stats[attr] = max(max_stats[attr], val)
    avg_time = agg_stats["latency_ns"] / instances

  is_sink = node.node_id == -1
  # If the node is a broadcast-receiving exchange node, the cardinality of rows produced
  # is the max over all instances (which should all have received the same number of
  # rows). Otherwise, the cardinality is the sum over all instances which process
  # disjoint partitions.
  if is_sink:
    cardinality = -1
  elif node.is_broadcast:
    cardinality = max_stats["cardinality"]
  else:
    cardinality = agg_stats["cardinality"]

  # NOTE(review): the padding literals below are reproduced as they appear in the
  # reviewed text; confirm their exact width against the checked-in file, since
  # whitespace runs may have been collapsed.
  label_prefix = ""
  if indent_level > 0:
    label_prefix = "|"
    label_prefix += " |" * (indent_level - 1)
    if new_indent_level:
      label_prefix += "--"
    else:
      label_prefix += " "

  est_stats = node.estimated_stats
  latency = max_stats["latency_ns"]
  cardinality_est = est_stats.cardinality
  memory_used = max_stats["memory_used"]
  memory_est = est_stats.memory_used
  if is_prettyprint:
    avg_time = _prettyprint_time(avg_time)
    latency = _prettyprint_time(latency)
    # Sinks have no meaningful cardinality, so those columns are left blank.
    cardinality = "" if is_sink else _prettyprint_units(cardinality)
    cardinality_est = "" if is_sink else _prettyprint_units(cardinality_est)
    memory_used = _prettyprint_bytes(memory_used)
    memory_est = _prettyprint_bytes(memory_est)

  if separate_prefix_column:
    row = [label_prefix, node.label]
  else:
    row = [label_prefix + node.label]
  row.extend([
    node.num_hosts,
    instances,
    avg_time,
    latency,
    cardinality,
    cardinality_est,
    memory_used,
    memory_est,
    node.label_detail])
  output.append(row)

  try:
    sender_idx = summary.exch_to_sender_map[idx]
    # This is an exchange node or a join node with a separate builder, so the source
    # is a fragment root, and should be printed next.
    sender_indent_level = indent_level + node.num_children
    sender_new_indent_level = node.num_children > 0
    build_exec_summary_table(summary, sender_idx, sender_indent_level,
                             sender_new_indent_level, output, is_prettyprint,
                             separate_prefix_column)
  except (KeyError, TypeError):
    # Fall through if idx not in map, or if exch_to_sender_map itself is not set
    pass

  idx += 1
  if node.num_children > 0:
    # The first child keeps the parent's indent level but is emitted after all of the
    # other (indented) children, so its rows are collected separately and appended last.
    first_child_output = []
    idx = build_exec_summary_table(summary, idx, indent_level, False, first_child_output,
                                   is_prettyprint, separate_prefix_column)
    for _ in range(1, node.num_children):
      # All other children are indented
      idx = build_exec_summary_table(summary, idx, indent_level + 1, True, output,
                                     is_prettyprint, separate_prefix_column)
    output += first_child_output
  return idx

shell/impala_client.py

Lines changed: 1 addition & 152 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535

3636
from beeswaxd import BeeswaxService
3737
from beeswaxd.BeeswaxService import QueryState
38-
from ExecStats.ttypes import TExecStats
3938
from ImpalaService import ImpalaService, ImpalaHiveServer2Service
4039
from ImpalaService.ImpalaHiveServer2Service import (TGetRuntimeProfileReq,
4140
TGetExecSummaryReq, TPingImpalaHS2ServiceReq, TCloseImpalaOperationReq)
@@ -46,6 +45,7 @@
4645
TOperationState, TFetchResultsReq, TFetchOrientation, TGetLogReq,
4746
TGetResultSetMetadataReq, TTypeId, TCancelOperationReq, TCloseOperationReq)
4847
from ImpalaHttpClient import ImpalaHttpClient
48+
from exec_summary import build_exec_summary_table
4949
from kerberos_util import get_kerb_host_from_kerberos_host_fqdn
5050
from thrift.protocol import TBinaryProtocol
5151
from thrift_sasl import TSaslClientTransport
@@ -110,157 +110,6 @@ def utf8_encode_if_needed(val):
110110
RPC_EXCEPTION_SERVER = "SERVER_ERROR"
111111

112112

113-
def build_exec_summary_table(summary, idx, indent_level, new_indent_level, output,
114-
is_prettyprint=True, separate_prefix_column=False):
115-
"""Direct translation of Coordinator::PrintExecSummary() to recursively build a list
116-
of rows of summary statistics, one per exec node
117-
118-
summary: the TExecSummary object that contains all the summary data
119-
120-
idx: the index of the node to print
121-
122-
indent_level: the number of spaces to print before writing the node's label, to give
123-
the appearance of a tree. The 0th child of a node has the same indent_level as its
124-
parent. All other children have an indent_level of one greater than their parent.
125-
126-
new_indent_level: If true, this indent level is different from the previous row's.
127-
128-
output: the list of rows into which to append the rows produced for this node and its
129-
children.
130-
131-
is_prettyprint: Optional. If True, print time, units, and bytes columns in pretty
132-
printed format.
133-
134-
separate_prefix_column: Optional. If True, the prefix and operator name will be
135-
returned as separate column. Otherwise, prefix and operater name will be concatenated
136-
into single column.
137-
138-
Returns the index of the next exec node in summary.exec_nodes that should be
139-
processed, used internally to this method only.
140-
"""
141-
attrs = ["latency_ns", "cpu_time_ns", "cardinality", "memory_used"]
142-
143-
# Initialise aggregate and maximum stats
144-
agg_stats, max_stats = TExecStats(), TExecStats()
145-
for attr in attrs:
146-
setattr(agg_stats, attr, 0)
147-
setattr(max_stats, attr, 0)
148-
149-
node = summary.nodes[idx]
150-
if node.exec_stats is not None:
151-
for stats in node.exec_stats:
152-
for attr in attrs:
153-
val = getattr(stats, attr)
154-
if val is not None:
155-
setattr(agg_stats, attr, getattr(agg_stats, attr) + val)
156-
setattr(max_stats, attr, max(getattr(max_stats, attr), val))
157-
158-
if node.exec_stats is not None and node.exec_stats:
159-
avg_time = agg_stats.latency_ns / len(node.exec_stats)
160-
else:
161-
avg_time = 0
162-
163-
is_sink = node.node_id == -1
164-
# If the node is a broadcast-receiving exchange node, the cardinality of rows produced
165-
# is the max over all instances (which should all have received the same number of
166-
# rows). Otherwise, the cardinality is the sum over all instances which process
167-
# disjoint partitions.
168-
if is_sink:
169-
cardinality = -1
170-
elif node.is_broadcast:
171-
cardinality = max_stats.cardinality
172-
else:
173-
cardinality = agg_stats.cardinality
174-
175-
est_stats = node.estimated_stats
176-
label_prefix = ""
177-
if indent_level > 0:
178-
label_prefix = "|"
179-
label_prefix += " |" * (indent_level - 1)
180-
if new_indent_level:
181-
label_prefix += "--"
182-
else:
183-
label_prefix += " "
184-
185-
def prettyprint(val, units, divisor):
186-
for unit in units:
187-
if val < divisor:
188-
if unit == units[0]:
189-
return "%d%s" % (val, unit)
190-
else:
191-
return "%3.2f%s" % (val, unit)
192-
val /= divisor
193-
194-
def prettyprint_bytes(byte_val):
195-
return prettyprint(byte_val, [' B', ' KB', ' MB', ' GB', ' TB'], 1024.0)
196-
197-
def prettyprint_units(unit_val):
198-
return prettyprint(unit_val, ["", "K", "M", "B"], 1000.0)
199-
200-
def prettyprint_time(time_val):
201-
return prettyprint(time_val, ["ns", "us", "ms", "s"], 1000.0)
202-
203-
instances = 0
204-
if node.exec_stats is not None:
205-
instances = len(node.exec_stats)
206-
latency = max_stats.latency_ns
207-
cardinality_est = est_stats.cardinality
208-
memory_used = max_stats.memory_used
209-
memory_est = est_stats.memory_used
210-
if (is_prettyprint):
211-
avg_time = prettyprint_time(avg_time)
212-
latency = prettyprint_time(latency)
213-
cardinality = "" if is_sink else prettyprint_units(cardinality)
214-
cardinality_est = "" if is_sink else prettyprint_units(cardinality_est)
215-
memory_used = prettyprint_bytes(memory_used)
216-
memory_est = prettyprint_bytes(memory_est)
217-
218-
row = list()
219-
if separate_prefix_column:
220-
row.append(label_prefix)
221-
row.append(node.label)
222-
else:
223-
row.append(label_prefix + node.label)
224-
225-
row.extend([
226-
node.num_hosts,
227-
instances,
228-
avg_time,
229-
latency,
230-
cardinality,
231-
cardinality_est,
232-
memory_used,
233-
memory_est,
234-
node.label_detail])
235-
236-
output.append(row)
237-
try:
238-
sender_idx = summary.exch_to_sender_map[idx]
239-
# This is an exchange node or a join node with a separate builder, so the source
240-
# is a fragment root, and should be printed next.
241-
sender_indent_level = indent_level + node.num_children
242-
sender_new_indent_level = node.num_children > 0
243-
build_exec_summary_table(summary, sender_idx, sender_indent_level,
244-
sender_new_indent_level, output, is_prettyprint,
245-
separate_prefix_column)
246-
except (KeyError, TypeError):
247-
# Fall through if idx not in map, or if exch_to_sender_map itself is not set
248-
pass
249-
250-
idx += 1
251-
if node.num_children > 0:
252-
first_child_output = []
253-
idx = build_exec_summary_table(summary, idx, indent_level, False, first_child_output,
254-
is_prettyprint, separate_prefix_column)
255-
for child_idx in xrange(1, node.num_children):
256-
# All other children are indented (we only have 0, 1 or 2 children for every exec
257-
# node at the moment)
258-
idx = build_exec_summary_table(summary, idx, indent_level + 1, True, output,
259-
is_prettyprint, separate_prefix_column)
260-
output += first_child_output
261-
return idx
262-
263-
264113
class QueryOptionLevels:
265114
"""These are the levels used when displaying query options.
266115
The values correspond to the ones in TQueryOptionLevel"""

shell/packaging/make_python_package.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ assemble_package_files() {
6363
cp "${SHELL_HOME}/kerberos_util.py" "${MODULE_LIB_DIR}"
6464
cp "${SHELL_HOME}/value_converter.py" "${MODULE_LIB_DIR}"
6565
cp "${SHELL_HOME}/thrift_printer.py" "${MODULE_LIB_DIR}"
66+
cp "${SHELL_HOME}/exec_summary.py" "${MODULE_LIB_DIR}"
6667

6768
cp "${SHELL_HOME}/packaging/README.md" "${PACKAGE_DIR}"
6869
cp "${SHELL_HOME}/packaging/MANIFEST.in" "${PACKAGE_DIR}"

0 commit comments

Comments
 (0)