Skip to content

Meiravg_disk_poc2 #597

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
8 changes: 4 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,11 @@ if(VECSIM_BUILD_TESTS)
FetchContent_MakeAvailable(google_benchmark)

add_subdirectory(tests/unit unit_tests)
# NOTE(review): this hunk disables the module tests and benchmark builds
# (old lines kept alongside their commented-out replacements by the diff).
# Presumably a temporary POC measure — confirm and re-enable before merge.
add_subdirectory(tests/module module_tests)
# add_subdirectory(tests/module module_tests)

if(NOT(USE_ASAN OR USE_MSAN))
add_subdirectory(tests/benchmark benchmark)
endif()
# if(NOT(USE_ASAN OR USE_MSAN))
# add_subdirectory(tests/benchmark benchmark)
# endif()
endif()

add_subdirectory(src/VecSim)
Expand Down
130 changes: 130 additions & 0 deletions create_bm_graphs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import pandas as pd
import matplotlib.pyplot as plt
import re

# ---- 1. Process CSV File (Memory Usage) ----
def plot_memory_usage(csv_file, output_file, num_entries=1000, step=5):
    """Plot process memory usage over time from a monitoring CSV.

    Reads ``csv_file`` (expects ``time_stamp`` and ``memory_bytes``
    columns), converts memory to GB, keeps every ``step``-th of the first
    ``num_entries`` rows, and saves a scatter plot to ``output_file`` (PNG).

    Args:
        csv_file: Path to the memory-monitor CSV.
        output_file: Destination PNG path.
        num_entries: Number of leading rows to consider (default 1000,
            matching the previously hard-coded value).
        step: Keep every ``step``-th row of the selected range (default 5,
            matching the previously hard-coded value).
    """
    df = pd.read_csv(csv_file)
    # Convert memory from bytes to GB (1 GB = 1024**3 bytes).
    df["memory_gb"] = df["memory_bytes"] / (1024**3)

    # Downsample: first `num_entries` rows, then every `step`-th row.
    df = df.head(num_entries).iloc[::step]

    plt.figure(figsize=(30, 20))
    plt.scatter(df["time_stamp"], df["memory_gb"], label="Memory Usage (GB)",
                color='b', marker='o', s=50)
    plt.xlabel("Time (Unix Timestamp)")
    plt.ylabel("Memory (GB)")
    plt.title("Memory Usage Over Time")
    plt.legend()
    plt.grid()

    plt.savefig(output_file)  # Save as PNG file
    plt.close()  # Close the plot to free memory

# ---- 2. Process Log File (Index Size) ----
def parse_log_file(log_file):
    """Extract (timestamp, index-size) samples from a build log.

    Each matching log line has the form:
        [<unix_ts>] Building <n> vectors time: T<secs> seconds

    Returns two parallel lists: timestamps (float, Unix seconds) and index
    sizes expressed in millions of vectors.
    """
    line_re = re.compile(
        r"\[(\d+\.\d+)\] Building (\d+) vectors time: T(\d+\.\d+) seconds")

    times = []
    sizes_millions = []
    with open(log_file, 'r') as fh:
        for raw_line in fh:
            hit = line_re.search(raw_line)
            if hit is None:
                continue  # Skip lines that are not build-progress records.
            ts, size, _elapsed = hit.groups()
            times.append(float(ts))
            sizes_millions.append(int(size) / 1_000_000)  # Convert to millions

    return times, sizes_millions

def plot_index_size(log_file, output_file):
    """Plot index size (number of vectors) over time and save as a PNG.

    Samples are taken from `log_file` via parse_log_file(); the figure is
    written to `output_file` and then closed to release memory.
    """
    xs, ys = parse_log_file(log_file)

    plt.figure(figsize=(10, 5))
    plt.plot(xs, ys, color='r', label="Index Size (vectors)")
    plt.xlabel("Time (Unix Timestamp)")
    plt.ylabel("Index Size (vectors)")
    plt.title("Index Size Over Time")
    plt.legend()
    plt.grid()

    # Write the figure to disk, then release it to free memory.
    plt.savefig(output_file)
    plt.close()

def parse_log_file(log_file):
    """Extract (timestamp, index-size) samples from a build log.

    NOTE(review): this is a byte-for-byte duplicate of the parse_log_file
    defined earlier in this file; this second definition shadows the first.
    Behavior is preserved here — delete one of the two copies.

    Matching lines have the form:
        [<unix_ts>] Building <n> vectors time: T<secs> seconds

    Returns two parallel lists: timestamps (float, Unix seconds) and index
    sizes in millions of vectors.
    """
    pattern = re.compile(
        r"\[(\d+\.\d+)\] Building (\d+) vectors time: T(\d+\.\d+) seconds")
    timestamps = []
    index_sizes_m = []  # Store index sizes in millions

    with open(log_file, 'r') as file:
        for line in file:
            match = pattern.search(line)
            if match:
                timestamp, index_size, _ = match.groups()
                timestamps.append(float(timestamp))
                index_sizes_m.append(int(index_size) / 1_000_000)  # Convert to millions

    return timestamps, index_sizes_m

def plot_combined(csv_file, log_file, output_file, num_entries=None, step=5):
    """Plot memory usage and index size over time on shared x / dual y axes.

    Args:
        csv_file: Memory-monitor CSV with `time_stamp` and `memory_bytes`.
        log_file: Build log parsed by parse_log_file() for index-size data.
        output_file: Destination PNG path.
        num_entries: Number of leading CSV rows to use (default: all rows).
        step: Keep every `step`-th of the selected rows.
    """
    mem_df = pd.read_csv(csv_file)

    if num_entries is None:
        num_entries = len(mem_df)

    # Bytes -> GB, then re-base the series so it starts at zero
    # (every sample has the first sample's value subtracted).
    mem_df["memory_gb"] = mem_df["memory_bytes"] / (1024**3)
    mem_df["memory_gb"] -= mem_df["memory_gb"].iloc[0]

    # Downsample: first `num_entries` rows, then every `step`-th row.
    mem_df = mem_df.head(num_entries).iloc[::step]

    # Index-size samples come from the build log.
    log_times, log_sizes_m = parse_log_file(log_file)

    fig, mem_ax = plt.subplots(figsize=(20, 10))

    # Left y-axis: memory usage in GB (blue dots).
    mem_ax.set_xlabel("Time (Unix Timestamp)")
    mem_ax.set_ylabel("Memory Usage (GB)", color='b')
    mem_ax.scatter(mem_df["time_stamp"], mem_df["memory_gb"],
                   color='b', marker='o', s=10, label="Memory Usage")
    mem_ax.tick_params(axis='y', labelcolor='b')

    # Right y-axis: index size in millions of vectors (red crosses).
    size_ax = mem_ax.twinx()
    size_ax.set_ylabel("Index Size (M vectors)", color='r')
    size_ax.scatter(log_times, log_sizes_m,
                    color='r', marker='x', s=10, label="Index Size")
    size_ax.tick_params(axis='y', labelcolor='r')

    plt.title("Memory Usage and Index Size Over Time")
    mem_ax.legend(loc='upper left')
    size_ax.legend(loc='upper right')

    plt.grid(True)

    # Persist the figure and release its memory.
    plt.savefig(output_file)
    plt.close()

    print(f"Combined plot saved to {output_file}")

# ---- Run the plots ----
# Run-specific identifiers: these paths are hard-coded to one experiment
# (PID 1335459); adjust `result_details` for other runs.
result_details = "results_M_60_efC_75_vec_10000000_q_10000_madvise_MADV_DONTNEED_bs_10240"
csv_file = f"results/mem_monitor/{result_details}_pid_1335459_rss_memory_monitor.csv"
log_file = f"results/logs/{result_details}_log.txt"

# plot_memory_usage(csv_file, f"results/graphs/{result_details}_memory_usage.png")
# plot_index_size(log_file, f"results/graphs/{result_details}_index_size.png")
plot_combined(csv_file, log_file, f"results/graphs/{result_details}_combined.png", step=20)

# Only the combined plot is produced above (the other two calls are
# commented out); the old message wrongly claimed memory_usage.png and
# index_size.png had been saved.
print(f"Graph saved: results/graphs/{result_details}_combined.png")
60 changes: 60 additions & 0 deletions monitor_proc_rss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import psutil
import time
import csv
import argparse

def collect_process_memory(pid, output_file):
    """Continuously log memory usage of a process to a CSV file until it dies.

    Once per second, records:
      * the RSS of `pid` (via psutil),
      * the cgroup `memory.current` value,
      * the cgroup `memory.events` "high" counter,
    appending one CSV row per sample to `output_file`. A status line is
    printed every `print_interval` samples (~10 minutes at 1 Hz). Returns
    when the monitored process exits.
    """
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["time_stamp", "memory_bytes", "memory_current", "memory_high"])

        i = 0
        print_interval = 600  # Print every 10 min (one sample per second).
        try:
            # Create the process handle once instead of once per second;
            # psutil raises NoSuchProcess from memory_info() after the
            # process exits, which terminates the loop below.
            proc = psutil.Process(pid)
            while True:
                memory = proc.memory_info().rss  # Resident set size in bytes.
                timestamp = time.time()  # Unix timestamp (float)

                # Read memory.current and memory.event::high from cgroup
                memory_current = get_memory_current(pid)
                memory_high = get_memory_high(pid)

                writer.writerow([timestamp, memory, memory_current, memory_high])
                file.flush()  # Don't lose buffered samples if the monitor dies.
                if i % print_interval == 0:
                    print(f"{timestamp}, rss: {memory} bytes, {(memory / 1024 / 1024 / 1024):.4f} GB, memory.current: {(memory_current / 1024 / 1024 / 1024):.4f} GB, memory.event::high: {memory_high}")

                time.sleep(1)  # Adjust sampling interval if needed
                i += 1
        except psutil.NoSuchProcess:
            print(f"Process {pid} has ended.")

def get_memory_current(pid):
    """Return current cgroup memory usage in bytes.

    Reads `memory.current` from the fixed `limited_process` cgroup. The
    `pid` argument is unused (kept for signature compatibility) — this
    assumes the monitored process lives in that cgroup; TODO confirm.
    """
    # Plain string (the original used an f-string with no placeholders).
    with open("/sys/fs/cgroup/limited_process/memory.current", "r") as f:
        memory_current = f.read().strip()
    return int(memory_current)  # Return value in bytes

def get_memory_high(pid):
    """Return the cgroup "high" event counter from `memory.events`.

    Lines in memory.events look like "high 42". The `pid` argument is
    unused (kept for signature compatibility). Raises ValueError if no
    "high" entry is present (the original raised UnboundLocalError in
    that case, and matched any line merely containing the substring
    "high").
    """
    with open("/sys/fs/cgroup/limited_process/memory.events", "r") as f:
        for line in f:
            fields = line.split()
            # Match the exact "high <count>" record, not a substring.
            if len(fields) == 2 and fields[0] == "high":
                return int(fields[1])  # Return value (event count)
    raise ValueError("no 'high' entry found in memory.events")

def generate_file_name(M, efC, num_vectors, num_queries, madvise, block_size, process_limit_high):
    """Build the canonical results-file prefix encoding all run parameters."""
    parts = (
        "results",
        f"M_{M}",
        f"efC_{efC}",
        f"vec_{num_vectors}",
        f"q_{num_queries}",
        f"madvise_{madvise}",
        f"bs_{block_size}",
        f"mem_limit_{process_limit_high}",
    )
    return "_".join(parts)

if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(description="Monitor RSS memory usage of a process.")
    arg_parser.add_argument("pid", type=int, help="PID of the process to monitor")
    cli_args = arg_parser.parse_args()

    # The PID comes from the command line; the remaining run parameters
    # are fixed for this experiment and only affect the output file name.
    pid = cli_args.pid
    run_name = generate_file_name(M=60, efC=75, num_vectors=10_000_000,
                                  num_queries=10_000, madvise="None",
                                  block_size=10_240, process_limit_high="55G")
    output_file = f"{run_name}_pid_{pid}_rss_memory_monitor.csv"
    print("Start collecting memory usage for process", pid)
    collect_process_memory(pid, output_file)
137 changes: 137 additions & 0 deletions parse_index_log.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import re
import numpy as np
import matplotlib.pyplot as plt
import csv

def format_index_size(size):
    """Format a vector count compactly: thousands as "<n>K", millions as "<n.n>M".

    Sizes below 1,000 are returned verbatim — the original integer
    division produced a misleading "0K" for them. The plain-number form
    is still accepted by the downstream K/M parser (its `else` branch).
    """
    if size < 1_000:
        return str(size)
    if size < 1_000_000:
        return f"{size // 1_000}K"
    return f"{size / 1_000_000:.1f}M"

def parse_index_log(file_path):
    """Parse a build log into (index_size, batch_time_hr, memory_gb) rows.

    Expects pairs of consecutive lines of the form:
        "... Building <n> vectors time: T<secs> seconds"
        "... Current index memory usage: ... <x> GB"
    For each pair, records the formatted index size, the time spent on the
    batch (hours; delta from the previous sample, 0 for the first), and
    the reported memory usage ("%.2f" string). Prints the rows as a table,
    writes them to `index_data.csv` in the current directory, and returns
    the list of rows.
    """
    with open(file_path, "r") as f:
        lines = f.readlines()

    index_data = []

    time_pattern = re.compile(r'Building (\d+) vectors time: T([\d.]+) seconds')
    memory_pattern = re.compile(r'Current index memory usage: .* ([\d.]+) GB')

    prev_time = None

    for i, line in enumerate(lines):
        time_match = time_pattern.search(line)
        if not time_match:
            continue
        # Guard against a trailing time line with no memory line after it:
        # the original indexed lines[i+1] unconditionally -> IndexError.
        if i + 1 >= len(lines):
            break
        memory_match = memory_pattern.search(lines[i + 1])  # Memory info is on the next line
        if not memory_match:
            continue

        index_size = int(time_match.group(1))
        time_elapsed = float(time_match.group(2))
        memory_usage = float(memory_match.group(1))

        # Compute time difference in hours (0 for the very first sample).
        batch_time_hr = 0 if prev_time is None else (time_elapsed - prev_time) / 3600
        prev_time = time_elapsed

        index_data.append((format_index_size(index_size),
                           round(batch_time_hr, 2),
                           f"{memory_usage:.2f}"))

    # Print the result
    print(f"{'Index Size':<12} {'Batch Time (hr)':<15} {'Memory Usage (GB)':<18}")
    print("=" * 50)
    for row in index_data:
        print(f"{row[0]:<12} {row[1]:<15} {row[2]:<18}")

    # Export data to CSV
    csv_filename = 'index_data.csv'
    with open(csv_filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Index Size", "Batch Time (hr)", "Memory Usage (GB)"])  # Write header
        writer.writerows(index_data)  # Write the data

    print(f"Data saved to {csv_filename}")

    return index_data

def _plot_index_growth(index_data, output_png="index_growth.png"):
    """Render batch time and memory usage vs. index size; save to `output_png`.

    `index_data` rows are (formatted_size, batch_time_hr, memory_usage_str)
    as produced by parse_index_log().
    """
    index_sizes = [row[0] for row in index_data]    # Formatted as K/M strings.
    batch_times = [row[1] for row in index_data]    # Batch elapsed times (hr).
    memory_usages = [round(float(row[2]), 2) for row in index_data]  # GB.

    # Convert formatted index sizes back to numbers, scaled to millions.
    index_sizes_numeric = []
    for size in index_sizes:
        if size.endswith("K"):
            index_sizes_numeric.append(float(size[:-1]) * 1e3 / 1e6)  # K -> millions
        elif size.endswith("M"):
            index_sizes_numeric.append(float(size[:-1]))  # Already in millions
        else:
            index_sizes_numeric.append(float(size) / 1e6)  # Raw count -> millions

    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Left y-axis: batch build time.
    ax1.set_xlabel("Index Size (vectors / 1M)")
    ax1.set_ylabel("Batch Time (hr)", color="tab:blue")
    ax1.plot(index_sizes_numeric, batch_times, marker='o', linestyle='-',
             color="tab:blue", label="Batch Time (hr)")
    ax1.tick_params(axis='y', labelcolor="tab:blue")
    ax1.set_ylim(0, 100)                    # Adjust based on your data range.
    ax1.set_yticks(np.arange(0, 101, 10))   # Ticks every 10 units.

    # Right y-axis: memory usage.
    ax2 = ax1.twinx()
    ax2.set_ylabel("Memory Usage (GB)", color="tab:green")
    ax2.plot(index_sizes_numeric, memory_usages, marker='s', linestyle='--',
             color="tab:green", label="Memory Usage (GB)")
    ax2.tick_params(axis='y', labelcolor="tab:green")
    ax2.set_ylim(0, 3)                      # Adjust based on your data range.
    ax2.set_yticks(np.arange(0, 3.1, 0.5))  # Ticks every 0.5 units.

    plt.title("Build Index")
    ax1.grid(True)

    # Mark where the process memory limit changed (x-axis is in millions).
    x_value_5M = 5
    ax1.axvline(x=x_value_5M, color='red', linestyle='--', linewidth=1)

    # Annotate both sides of the line at 90% of the y-axis height.
    y_pos = 0.9
    ax1.text(
        x_value_5M + 1.5,  # Shift the label to the right of the line.
        ax1.get_ylim()[1] * y_pos,
        "Process limit: 5GB",
        fontsize=12, color="red",
        ha="left", va="center",
        bbox=dict(facecolor='white', alpha=0.7, edgecolor='red')
    )
    ax1.text(
        x_value_5M - 1.5,  # Shift the label to the left of the line.
        ax1.get_ylim()[1] * y_pos,
        "Process limit: 2GB",
        fontsize=12, color="red",
        ha="right", va="center",
        bbox=dict(facecolor='white', alpha=0.7, edgecolor='red')
    )

    plt.tight_layout()  # Adjust layout to make sure everything fits
    plt.savefig(output_png, dpi=300, bbox_inches="tight")
    print(f"Graph saved as '{output_png}'")


if __name__ == "__main__":
    # Guarded so importing this module no longer crashes when the
    # hard-coded log file is absent; run the script with your log file.
    index_data = parse_index_log("results_M_60_efC_75_vec_10000000_q_10000_madvise_None_bs_10240_mem_limit_2G_pid_171927_log.txt")
    _plot_index_growth(index_data)
Loading