Skip to content

⚡️ Speed up function dataframe_merge by 1,247% #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 30 additions & 12 deletions src/numpy_pandas/dataframe_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,24 +37,39 @@ def dataframe_merge(
result_data = []
left_cols = list(left.columns)
right_cols = [col for col in right.columns if col != right_on]
# Use numpy arrays for fast row and column access
left_values = left.values
right_values = right.values
left_on_idx = left.columns.get_loc(left_on)
right_on_idx = right.columns.get_loc(right_on)
# Build right_dict mapping join keys to row indices, but using values array for fast access
right_dict = {}
for i in range(len(right)):
key = right.iloc[i][right_on]
# Store right row indexes for each key, as before, but no iloc
for i in range(right_values.shape[0]):
key = right_values[i, right_on_idx]
if key not in right_dict:
right_dict[key] = []
right_dict[key].append(i)
for i in range(len(left)):
left_row = left.iloc[i]
key = left_row[left_on]
# Precompute column indices to avoid repeated get_loc()
left_col_indices = [left.columns.get_loc(col) for col in left_cols]
right_col_indices = [right.columns.get_loc(col) for col in right_cols]
# Outer join
for i in range(left_values.shape[0]):
key = left_values[i, left_on_idx]
if key in right_dict:
left_row_values = left_values[i]
# Compose dict of left row vals only once
left_row_dict = {
col: left_row_values[idx]
for col, idx in zip(left_cols, left_col_indices)
}
for right_idx in right_dict[key]:
right_row = right.iloc[right_idx]
new_row = {}
for col in left_cols:
new_row[col] = left_row[col]
for col in right_cols:
new_row[col] = right_row[col]
result_data.append(new_row)
right_row_values = right_values[right_idx]
# Add right row values except join column
row = left_row_dict.copy()
for col, idx in zip(right_cols, right_col_indices):
row[col] = right_row_values[idx]
result_data.append(row)
return pd.DataFrame(result_data)


Expand All @@ -66,14 +81,17 @@ def pivot_table(

def agg_func(values):
return sum(values) / len(values)

elif aggfunc == "sum":

def agg_func(values):
return sum(values)

elif aggfunc == "count":

def agg_func(values):
return len(values)

else:
raise ValueError(f"Unsupported aggregation function: {aggfunc}")
grouped_data = {}
Expand Down