Skip to content

Commit 00f847d

Browse files
authored
Implement hierarchical CSV splitting functionality
Adds the CSV splitting script. It splits a CSV file into multiple files, one per unique combination of values in the 'Cluster', 'Type', and 'Indicator' columns. Before splitting, it strips whitespace from the column headers and cleans and converts the 'Year' and 'Value' columns to numeric types.
1 parent 856dd52 commit 00f847d

File tree

1 file changed

+112
-0
lines changed

1 file changed

+112
-0
lines changed

split_csv.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import pandas as pd
2+
import os
3+
import sys
4+
from pathlib import Path
5+
6+
def split_csv_hierarchically(input_file, output_dir='output'):
    """
    Split a CSV file by Cluster, then Type, then Indicator.

    One output CSV is written per unique (Cluster, Type, Indicator)
    combination. The three splitting columns are removed from each output
    file and their values are encoded in the filename instead:
    ``<Cluster>_<Type>_<Indicator>.csv``.

    Before splitting, surrounding whitespace is stripped from the column
    headers, 'Year' (if present) is converted to nullable integers and
    'Value' (if present) has commas and spaces removed and is converted to
    numbers. Entries that cannot be parsed as numbers become missing values.

    Rows whose Cluster, Type or Indicator is missing are kept: they are
    grouped together and the missing key appears as 'nan' in the filename.

    Args:
        input_file: Path to the input CSV file
        output_dir: Directory where output files will be saved (created,
            including parents, if it does not exist)

    Raises:
        ValueError: If the 'Cluster', 'Type' or 'Indicator' column is not
            present in the CSV.
    """
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # low_memory=False reads the file in one pass, avoiding mixed-dtype warnings
    print(f"Reading {input_file}...")
    df = pd.read_csv(input_file, low_memory=False)

    # Strip whitespace from all column headers
    df.columns = df.columns.str.strip()
    print("Stripped whitespace from column headers")

    # Convert Year to the nullable integer dtype (Int64, not int64) so that
    # empty or unparseable entries stay missing
    if 'Year' in df.columns:
        df['Year'] = pd.to_numeric(df['Year'], errors='coerce').astype('Int64')
        print("Converted 'Year' column to integers (preserving empty values)")

    # Remove thousands separators (commas, spaces) from Value, then convert
    if 'Value' in df.columns:
        cleaned_values = (
            df['Value']
            .astype(str)
            .str.replace(',', '', regex=False)
            .str.replace(' ', '', regex=False)
        )
        df['Value'] = pd.to_numeric(cleaned_values, errors='coerce')
        print("Cleaned and converted 'Value' column to numbers")

    # Verify required columns exist
    required_columns = ['Cluster', 'Type', 'Indicator']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in CSV. Available columns: {df.columns.tolist()}")

    print(f"Found {len(df['Cluster'].unique())} unique clusters")

    # Nested single-key groupbys keep the hierarchical, first-appearance
    # ordering (sort=False). dropna=False keeps rows whose key is missing;
    # an equality filter would never match NaN and would silently lose them.
    for cluster, cluster_df in df.groupby('Cluster', sort=False, dropna=False):
        for type_value, type_df in cluster_df.groupby('Type', sort=False, dropna=False):
            for indicator, group_df in type_df.groupby('Indicator', sort=False, dropna=False):
                # The splitting values live in the filename, not the data
                indicator_df = group_df.drop(columns=required_columns)

                # NOTE: distinct values that sanitise to the same text
                # (e.g. 'A B' and 'A_B') map to the same filename, so the
                # later group overwrites the earlier one.
                filename = (
                    f"{_safe_name_part(cluster)}_"
                    f"{_safe_name_part(type_value)}_"
                    f"{_safe_name_part(indicator)}.csv"
                )
                output_path = os.path.join(output_dir, filename)

                # Save the split CSV
                indicator_df.to_csv(output_path, index=False)
                print(f"Created: {filename} ({len(indicator_df)} rows)")

    print(f"\nAll files saved to '{output_dir}' directory")


# Characters replaced by '_' when a group value is used in a filename
_UNSAFE_FILENAME_CHARS = str.maketrans({'/': '_', '\\': '_', ' ': '_'})


def _safe_name_part(value):
    """Return *value* as text for a filename, with separators and spaces as '_'."""
    # Missing keys are always spelled 'nan', whichever NA marker pandas uses
    text = 'nan' if pd.isna(value) else str(value)
    return text.translate(_UNSAFE_FILENAME_CHARS)
87+
88+
# Script entry point: split_csv.py <input_file.csv> [output_directory]
if __name__ == "__main__":
    # Without an input path there is nothing to do: show usage and exit
    if len(sys.argv) < 2:
        usage_lines = (
            "Usage: python split_csv.py <input_file.csv> [output_directory]",
            "\nExamples:",
            " python split_csv.py mydata.csv",
            " python split_csv.py mydata.csv split_output",
            " python split_csv.py /path/to/mydata.csv /path/to/output",
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    # First argument is the input file; an optional second argument is the
    # output directory (defaults to 'output')
    input_csv, *remaining_args = sys.argv[1:]
    output_dir = remaining_args[0] if remaining_args else 'output'

    # Collect a failure message, if any, so there is one exit path
    failure_message = None
    try:
        split_csv_hierarchically(input_csv, output_dir=output_dir)
    except FileNotFoundError:
        failure_message = f"Error: File '{input_csv}' not found."
    except Exception as exc:
        failure_message = f"Error: {exc}"

    if failure_message is not None:
        print(failure_message)
        sys.exit(1)

0 commit comments

Comments
 (0)