-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathupload_data.py
27 lines (19 loc) · 992 Bytes
/
upload_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import zipfile
import requests
import io
import pandas as pd
from pandas_gbq import to_gbq
# Step 1: Download the zipped World Bank file and load its main CSV (the first 4 rows are skipped)
def download_and_extract_csv(url, timeout=60):
    """Download a zipped World Bank export and load its main CSV.

    The archive is fetched into memory, the first member whose name starts
    with ``API_``, ends with ``.csv`` and does not contain ``Metadata`` is
    selected, and that member is parsed with pandas.

    Args:
        url: Address of the ZIP archive to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        A ``pandas.DataFrame`` built from the selected CSV, with the first
        four lines of the file skipped.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        zipfile.BadZipFile: If the downloaded payload is not a ZIP archive.
        IndexError: If the archive contains no matching data file.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx instead of trying to unzip an HTML error page.
    response.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        # Find the main data file in the ZIP (skip metadata files).
        target_file = next(
            (
                name
                for name in archive.namelist()
                if name.startswith('API_')
                and name.endswith('.csv')
                and 'Metadata' not in name
            ),
            None,
        )
        if target_file is None:
            # IndexError is kept so callers that caught the original
            # "list index out of range" failure keep working.
            raise IndexError(
                f"no 'API_*.csv' data file found in archive from {url!r}; "
                f"members: {archive.namelist()}"
            )

        # Skip the 4 lines that precede the header row of the CSV.
        # NOTE(review): 'latin1' never fails to decode but would garble
        # non-ASCII text if the file is actually UTF-8 — confirm encoding.
        with archive.open(target_file) as csv_file:
            return pd.read_csv(
                csv_file,
                skiprows=4,
                encoding='latin1',
                delimiter=',',
                engine='python',
            )
# Step 2: Upload the full DataFrame to BigQuery, replacing any existing table
def truncate_and_upload(PROJECT_ID, TABLE_FULL_ID, url):
    """Fetch the CSV behind *url* and overwrite a BigQuery table with it.

    Args:
        PROJECT_ID: Passed to ``to_gbq`` as ``project_id``.
        TABLE_FULL_ID: Destination table identifier handed to ``to_gbq``.
        url: Address of the zipped source file, forwarded to
            ``download_and_extract_csv``.
    """
    # Download and parse first; nothing is sent to BigQuery if this raises.
    frame = download_and_extract_csv(url)

    # if_exists='replace' swaps out any existing table rather than appending.
    to_gbq(
        frame,
        TABLE_FULL_ID,
        project_id=PROJECT_ID,
        if_exists='replace',
    )