Skip to content

Commit 423fab2

Browse files
authored
Merge pull request #694 from aniket486/main
Add a new script to run BigQuery queries with python client
2 parents e187508 + bd40be7 commit 423fab2

File tree

6 files changed

+152
-72
lines changed

6 files changed

+152
-72
lines changed

bigquery/README.md

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,41 @@
1-
As of 2025, Google Bigquery allow publishing benchmark results, which was not the case earlier.
2-
3-
It's very difficult to find, how to create a database.
4-
Databases are named "datasets". You need to press on `` near project.
1+
Download Google Cloud CLI and configure your project settings using the commands below.
2+
You can skip this step if you are using [Cloud Shell](https://docs.cloud.google.com/shell/docs/launching-cloud-shell), which already comes with gcloud preinstalled:
3+
```
4+
curl https://sdk.cloud.google.com | bash
5+
exec -l $SHELL
6+
gcloud init
7+
```
58

6-
Create dataset `test`.
7-
Go to the query editor and paste the contents of `create.sql`.
8-
It will take two seconds to create a table.
9+
Enable BigQuery permissions for this project if they haven't been enabled already:
10+
```
11+
# 1. Store the active project ID and authenticated email in variables for convenience
12+
export PROJECT_ID=$(gcloud config get-value project)
13+
export USER_EMAIL=$(gcloud config get-value account)
14+
15+
# 2. Grant the BigQuery User role (Fixes datasets.create and jobs.create)
16+
gcloud projects add-iam-policy-binding $PROJECT_ID \
17+
--member="user:$USER_EMAIL" \
18+
--role="roles/bigquery.user"
19+
```
920

10-
Download Google Cloud CLI:
21+
Create the dataset and table in BigQuery:
1122
```
12-
wget --continue --progress=dot:giga https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz
13-
tar -xf google-cloud-cli-linux-x86_64.tar.gz
14-
./google-cloud-sdk/install.sh
15-
source .bashrc
16-
./google-cloud-sdk/bin/gcloud init
23+
bq mk --dataset test
24+
25+
bq query --use_legacy_sql=false < create.sql
1726
```
1827

19-
Load the data:
28+
Load the data in the table:
2029
```
2130
wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.csv.gz'
22-
gzip -d -f hits.csv.gz
2331
32+
# No need to unzip; BigQuery can load directly from a GZIP-compressed CSV file:
2433
echo -n "Load time: "
25-
command time -f '%e' bq load --source_format CSV --allow_quoted_newlines=1 test.hits hits.csv
34+
command time -f '%e' bq load --source_format CSV --allow_quoted_newlines=1 test.hits hits.csv.gz
2635
```
2736

2837
Run the benchmark:
29-
3038
```
31-
./run.sh 2>&1 | tee log.txt
32-
33-
cat log.txt |
34-
grep -P '^real|^Error' |
35-
sed -r -e 's/^Error.*$/null/; s/^real\s*([0-9.]+)m([0-9.]+)s$/\1 \2/' |
36-
awk '{ if ($2 != "") { print $1 * 60 + $2 } else { print $1 } }' |
37-
awk '{ if ($1 == "null") { skip = 1 } else { if (i % 3 == 0) { printf "[" }; printf skip ? "null" : $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; skip = 0; } }'
39+
pip install google-cloud-bigquery
40+
python3 run_queries.py > results.txt 2> log.txt
3841
```

bigquery/create.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/bin/bash

# Abort on the first failing command so we don't try to create the table
# in a dataset that failed to be created.
set -e

# Create the BigQuery dataset that will hold the benchmark table.
bq mk --dataset test

# Create the hits table (schema + clustering) defined in create.sql.
bq query --use_legacy_sql=false < create.sql

bigquery/create.sql

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,5 +104,6 @@ CREATE TABLE test.hits
104104
HasGCLID SMALLINT NOT NULL,
105105
RefererHash BIGINT NOT NULL,
106106
URLHash BIGINT NOT NULL,
107-
CLID INTEGER NOT NULL
108-
);
107+
CLID INTEGER NOT NULL,
108+
)
109+
CLUSTER BY CounterID, EventDate, UserID, EventTime;

bigquery/queries.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ SELECT SearchPhrase FROM test.hits WHERE SearchPhrase <> '' ORDER BY EventTime L
2626
SELECT SearchPhrase FROM test.hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;
2727
SELECT SearchPhrase FROM test.hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;
2828
SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM test.hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
29-
SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM test.hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
29+
SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\\.)?([^/]+)/.*$', '\\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM test.hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
3030
SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), 
SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM test.hits;
3131
SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM test.hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
3232
SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM test.hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;

bigquery/results/result.json

Lines changed: 45 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"system": "Bigquery",
3-
"date": "2025-04-09",
3+
"date": "2025-10-28",
44
"machine": "serverless",
55
"cluster_size": "serverless",
66
"proprietary": "yes",
@@ -10,52 +10,52 @@
1010

1111
"tags": ["serverless", "column-oriented", "gcp", "managed"],
1212

13-
"load_time": 1146,
13+
"load_time": 776.91,
1414
"data_size": 8760000000,
1515

1616
"result": [
17-
[4.862,4.001,3.921],
18-
[4.268,4.113,4.467],
19-
[4.341,4.15,4.219],
20-
[4.124,3.996,4.337],
21-
[4.553,4.36,4.349],
22-
[4.565,4.4,4.661],
23-
[4.089,4.132,3.974],
24-
[4.514,4.296,4.312],
25-
[6.183,6.155,4.557],
26-
[6.068,6.106,6.259],
27-
[4.109,4.082,4.165],
28-
[4.24,3.981,4.054],
29-
[4.295,4.301,4.283],
30-
[6.03,6.079,6.094],
31-
[4.383,4.399,4.218],
32-
[4.304,4.23,4.189],
33-
[4.849,4.86,4.62],
34-
[4.309,4.371,4.393],
35-
[6.096,6.109,6.071],
36-
[3.838,3.89,3.938],
37-
[4.249,4.037,4.136],
38-
[4.337,4.196,4.264],
39-
[4.493,4.603,4.435],
40-
[6.125,4.667,4.559],
41-
[4.039,4.039,3.942],
42-
[3.903,4.239,4.003],
43-
[4.013,4.108,4.073],
44-
[4.524,4.474,4.498],
45-
[null,null,null],
46-
[4.866,4.862,6.063],
47-
[4.271,4.403,4.34],
48-
[4.39,4.314,4.566],
49-
[7.233,7.322,7.241],
50-
[7.39,7.382,7.298],
51-
[6.05,6.084,6.362],
52-
[4.31,4.222,4.254],
53-
[4.181,4.003,3.95],
54-
[3.98,3.988,3.982],
55-
[4.017,4.004,3.987],
56-
[4.334,4.322,4.445],
57-
[4.126,3.853,3.982],
58-
[4.214,3.931,3.921],
59-
[4.033,3.913,3.866]
17+
[0.383933,0.402355,0.370758],
18+
[0.334439,0.433776,0.416341],
19+
[0.469506,0.359557,0.386433],
20+
[0.491417,0.333208,0.4758],
21+
[0.552464,0.652322,0.555889],
22+
[0.581302,0.603089,0.674999],
23+
[1.087835,0.639649,0.360542],
24+
[0.438221,0.759105,0.497731],
25+
[0.702109,0.712533,0.678109],
26+
[0.857454,0.968303,0.995039],
27+
[0.547042,0.479513,0.475109],
28+
[0.547026,0.549529,0.614708],
29+
[0.686315,0.580551,0.630673],
30+
[1.792573,2.034019,1.845895],
31+
[0.610674,0.677655,0.643796],
32+
[0.580303,0.729024,0.622044],
33+
[0.760401,0.809858,0.822725],
34+
[0.721757,0.611165,0.744566],
35+
[1.49368,1.372045,1.498892],
36+
[0.363523,0.383959,0.366856],
37+
[0.625735,0.49802,0.473233],
38+
[0.513777,0.508772,0.527258],
39+
[0.895406,0.874879,0.799704],
40+
[0.909036,0.679151,0.730413],
41+
[0.358434,0.509104,0.467827],
42+
[0.421586,0.428603,0.33761],
43+
[0.54752,0.364919,0.444499],
44+
[0.691434,0.674469,0.930067],
45+
[1.143579,1.034013,1.105913],
46+
[0.569294,0.444362,0.463864],
47+
[0.517151,0.53565,0.523663],
48+
[0.56208,0.573,0.543899],
49+
[1.409102,1.116484,1.295522],
50+
[1.413902,1.346194,1.406088],
51+
[1.068575,0.985308,1.194028],
52+
[0.781501,0.524615,0.664192],
53+
[0.678144,0.666519,0.548661],
54+
[0.477265,0.445584,0.469621],
55+
[0.554599,0.530927,0.551336],
56+
[0.777017,0.696796,0.810055],
57+
[0.427604,0.43113,0.449339],
58+
[0.434927,0.407959,0.435918],
59+
[0.478507,0.425838,0.541504]
6060
]
6161
}

bigquery/run_queries.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
from google.cloud import bigquery
2+
from google.cloud.bigquery.enums import JobCreationMode
3+
4+
import sys
5+
from typing import TextIO, Any
6+
from datetime import datetime
7+
8+
def log(*objects: Any, sep: str = ' ', end: str = '\n', file: TextIO = sys.stderr, severity: str = 'INFO') -> None:
    """
    Mimics the built-in print() function signature but prepends a
    timestamp and a configurable severity level to the output.

    Args:
        *objects: The objects to be printed (converted to strings).
        sep (str): Separator inserted between values, default a space.
        end (str): String appended after the last value, default a newline.
        file (TextIO): Object with a write(string) method, default sys.stderr.
        severity (str): The log level (e.g., "INFO", "WARNING", "ERROR").
    """
    # 1. Join the objects with the separator, exactly like print() does.
    message = sep.join(str(obj) for obj in objects)

    # 2. Build the "[timestamp] [SEVERITY]: " prefix.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    prefix = f"[{timestamp}] [{severity.upper()}]: "

    # 3. Emit via file.write so the 'end' argument is handled explicitly.
    file.write(prefix + message + end)

    # 4. The interpreter flushes the std streams on exit; flush anything
    #    else (e.g. a plain file) so log lines are not lost mid-run.
    if file is not sys.stdout and file is not sys.stderr:
        file.flush()
38+
39+
40+
# Disable the query cache so every run measures real execution time.
job_config = bigquery.QueryJobConfig()
job_config.use_query_cache = False

# Optional job creation lets BigQuery skip the jobs.insert round trip
# for short queries, reducing client-side latency.
client = bigquery.Client(
    default_job_creation_mode=JobCreationMode.JOB_CREATION_OPTIONAL
)

TRIES = 3
query_num = 0

# queries.sql holds one query per line; run each query TRIES times and
# print the client-side timings as one JSON-style row: [t1,t2,t3],
with open('queries.sql', 'r') as queries_file:
    for query in queries_file:
        query = query.strip()
        print("[", end='')
        query_num += 1
        for i in range(TRIES):
            # Close the row after the final try, otherwise separate with a comma.
            endstr = "],\n" if i == TRIES - 1 else ","
            log(f"[q{query_num}: {i}]: {query}")
            try:
                client_start_time = datetime.now()
                results = client.query_and_wait(query, job_config=job_config)
                client_end_time = datetime.now()

                client_time = client_end_time - client_start_time
                print(f"{client_time.total_seconds()}", end=endstr)

                log(f"Job ID: **{results.job_id}**")
                log(f"Query ID: **{results.query_id}**")
                log(f"Client time: **{client_time}**")

            except Exception as e:
                # Emit null so the results row stays well-formed even when
                # a query fails (matches the benchmark's result format).
                print("null", end=endstr)
                log(f"Job failed with error: {e}", severity="ERROR")
71+

0 commit comments

Comments
 (0)