Skip to content

Meiravg_disk_poc2 #597

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
8 changes: 4 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,11 @@ if(VECSIM_BUILD_TESTS)
FetchContent_MakeAvailable(google_benchmark)

add_subdirectory(tests/unit unit_tests)
# NOTE(review): this hunk disables the module tests and benchmark builds
# (old lines kept alongside their commented-out replacements by the diff).
# Presumably a temporary POC measure — confirm and re-enable before merge.
add_subdirectory(tests/module module_tests)
# add_subdirectory(tests/module module_tests)

if(NOT(USE_ASAN OR USE_MSAN))
add_subdirectory(tests/benchmark benchmark)
endif()
# if(NOT(USE_ASAN OR USE_MSAN))
# add_subdirectory(tests/benchmark benchmark)
# endif()
endif()

add_subdirectory(src/VecSim)
Expand Down
130 changes: 130 additions & 0 deletions create_bm_graphs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import pandas as pd
import matplotlib.pyplot as plt
import re

# ---- 1. Process CSV File (Memory Usage) ----
def plot_memory_usage(csv_file, output_file, num_entries=1000, step=5):
    """Plot process memory usage over time from a monitoring CSV.

    Reads ``csv_file`` (expects ``time_stamp`` and ``memory_bytes``
    columns), converts memory to GB, keeps every ``step``-th of the first
    ``num_entries`` rows, and saves a scatter plot to ``output_file`` (PNG).

    Args:
        csv_file: Path to the memory-monitor CSV.
        output_file: Destination PNG path.
        num_entries: Number of leading rows to consider (default 1000,
            matching the previously hard-coded value).
        step: Keep every ``step``-th row of the selected range (default 5,
            matching the previously hard-coded value).
    """
    df = pd.read_csv(csv_file)
    # Convert memory from bytes to GB (1 GB = 1024**3 bytes).
    df["memory_gb"] = df["memory_bytes"] / (1024**3)

    # Downsample: first `num_entries` rows, then every `step`-th row.
    df = df.head(num_entries).iloc[::step]

    plt.figure(figsize=(30, 20))
    plt.scatter(df["time_stamp"], df["memory_gb"], label="Memory Usage (GB)",
                color='b', marker='o', s=50)
    plt.xlabel("Time (Unix Timestamp)")
    plt.ylabel("Memory (GB)")
    plt.title("Memory Usage Over Time")
    plt.legend()
    plt.grid()

    plt.savefig(output_file)  # Save as PNG file
    plt.close()  # Close the plot to free memory

# ---- 2. Process Log File (Index Size) ----
def parse_log_file(log_file):
    """Extract (timestamp, index-size) samples from a build log.

    Each matching log line has the form:
        [<unix_ts>] Building <n> vectors time: T<secs> seconds

    Returns two parallel lists: timestamps (float, Unix seconds) and index
    sizes expressed in millions of vectors.
    """
    line_re = re.compile(
        r"\[(\d+\.\d+)\] Building (\d+) vectors time: T(\d+\.\d+) seconds")

    times = []
    sizes_millions = []
    with open(log_file, 'r') as fh:
        for raw_line in fh:
            hit = line_re.search(raw_line)
            if hit is None:
                continue  # Skip lines that are not build-progress records.
            ts, size, _elapsed = hit.groups()
            times.append(float(ts))
            sizes_millions.append(int(size) / 1_000_000)  # Convert to millions

    return times, sizes_millions

def plot_index_size(log_file, output_file):
    """Plot index size (number of vectors) over time and save as a PNG.

    Samples are taken from `log_file` via parse_log_file(); the figure is
    written to `output_file` and then closed to release memory.
    """
    xs, ys = parse_log_file(log_file)

    plt.figure(figsize=(10, 5))
    plt.plot(xs, ys, color='r', label="Index Size (vectors)")
    plt.xlabel("Time (Unix Timestamp)")
    plt.ylabel("Index Size (vectors)")
    plt.title("Index Size Over Time")
    plt.legend()
    plt.grid()

    # Write the figure to disk, then release it to free memory.
    plt.savefig(output_file)
    plt.close()

def parse_log_file(log_file):
    """Extract (timestamp, index-size) samples from a build log.

    NOTE(review): this is a byte-for-byte duplicate of the parse_log_file
    defined earlier in this file; this second definition shadows the first.
    Behavior is preserved here — delete one of the two copies.

    Matching lines have the form:
        [<unix_ts>] Building <n> vectors time: T<secs> seconds

    Returns two parallel lists: timestamps (float, Unix seconds) and index
    sizes in millions of vectors.
    """
    pattern = re.compile(
        r"\[(\d+\.\d+)\] Building (\d+) vectors time: T(\d+\.\d+) seconds")
    timestamps = []
    index_sizes_m = []  # Store index sizes in millions

    with open(log_file, 'r') as file:
        for line in file:
            match = pattern.search(line)
            if match:
                timestamp, index_size, _ = match.groups()
                timestamps.append(float(timestamp))
                index_sizes_m.append(int(index_size) / 1_000_000)  # Convert to millions

    return timestamps, index_sizes_m

def plot_combined(csv_file, log_file, output_file, num_entries=None, step=5):
    """Plot memory usage and index size over time on shared x / dual y axes.

    Args:
        csv_file: Memory-monitor CSV with `time_stamp` and `memory_bytes`.
        log_file: Build log parsed by parse_log_file() for index-size data.
        output_file: Destination PNG path.
        num_entries: Number of leading CSV rows to use (default: all rows).
        step: Keep every `step`-th of the selected rows.
    """
    mem_df = pd.read_csv(csv_file)

    if num_entries is None:
        num_entries = len(mem_df)

    # Bytes -> GB, then re-base the series so it starts at zero
    # (every sample has the first sample's value subtracted).
    mem_df["memory_gb"] = mem_df["memory_bytes"] / (1024**3)
    mem_df["memory_gb"] -= mem_df["memory_gb"].iloc[0]

    # Downsample: first `num_entries` rows, then every `step`-th row.
    mem_df = mem_df.head(num_entries).iloc[::step]

    # Index-size samples come from the build log.
    log_times, log_sizes_m = parse_log_file(log_file)

    fig, mem_ax = plt.subplots(figsize=(20, 10))

    # Left y-axis: memory usage in GB (blue dots).
    mem_ax.set_xlabel("Time (Unix Timestamp)")
    mem_ax.set_ylabel("Memory Usage (GB)", color='b')
    mem_ax.scatter(mem_df["time_stamp"], mem_df["memory_gb"],
                   color='b', marker='o', s=10, label="Memory Usage")
    mem_ax.tick_params(axis='y', labelcolor='b')

    # Right y-axis: index size in millions of vectors (red crosses).
    size_ax = mem_ax.twinx()
    size_ax.set_ylabel("Index Size (M vectors)", color='r')
    size_ax.scatter(log_times, log_sizes_m,
                    color='r', marker='x', s=10, label="Index Size")
    size_ax.tick_params(axis='y', labelcolor='r')

    plt.title("Memory Usage and Index Size Over Time")
    mem_ax.legend(loc='upper left')
    size_ax.legend(loc='upper right')

    plt.grid(True)

    # Persist the figure and release its memory.
    plt.savefig(output_file)
    plt.close()

    print(f"Combined plot saved to {output_file}")

# ---- Run the plots ----
# Run-specific identifiers: these paths are hard-coded to one experiment
# (PID 1335459); adjust `result_details` for other runs.
result_details = "results_M_60_efC_75_vec_10000000_q_10000_madvise_MADV_DONTNEED_bs_10240"
csv_file = f"results/mem_monitor/{result_details}_pid_1335459_rss_memory_monitor.csv"
log_file = f"results/logs/{result_details}_log.txt"

# plot_memory_usage(csv_file, f"results/graphs/{result_details}_memory_usage.png")
# plot_index_size(log_file, f"results/graphs/{result_details}_index_size.png")
plot_combined(csv_file, log_file, f"results/graphs/{result_details}_combined.png", step=20)

# Only the combined plot is produced above (the other two calls are
# commented out); the old message wrongly claimed memory_usage.png and
# index_size.png had been saved.
print(f"Graph saved: results/graphs/{result_details}_combined.png")
60 changes: 60 additions & 0 deletions monitor_proc_rss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import psutil
import time
import csv
import argparse

def collect_process_memory(pid, output_file):
    """Continuously log memory usage of a process to a CSV file until it dies.

    Once per second, records:
      * the RSS of `pid` (via psutil),
      * the cgroup `memory.current` value,
      * the cgroup `memory.events` "high" counter,
    appending one CSV row per sample to `output_file`. A status line is
    printed every `print_interval` samples (~10 minutes at 1 Hz). Returns
    when the monitored process exits.
    """
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["time_stamp", "memory_bytes", "memory_current", "memory_high"])

        i = 0
        print_interval = 600  # Print every 10 min (one sample per second).
        try:
            # Create the process handle once instead of once per second;
            # psutil raises NoSuchProcess from memory_info() after the
            # process exits, which terminates the loop below.
            proc = psutil.Process(pid)
            while True:
                memory = proc.memory_info().rss  # Resident set size in bytes.
                timestamp = time.time()  # Unix timestamp (float)

                # Read memory.current and memory.event::high from cgroup
                memory_current = get_memory_current(pid)
                memory_high = get_memory_high(pid)

                writer.writerow([timestamp, memory, memory_current, memory_high])
                file.flush()  # Don't lose buffered samples if the monitor dies.
                if i % print_interval == 0:
                    print(f"{timestamp}, rss: {memory} bytes, {(memory / 1024 / 1024 / 1024):.4f} GB, memory.current: {(memory_current / 1024 / 1024 / 1024):.4f} GB, memory.event::high: {memory_high}")

                time.sleep(1)  # Adjust sampling interval if needed
                i += 1
        except psutil.NoSuchProcess:
            print(f"Process {pid} has ended.")

def get_memory_current(pid):
    """Return current cgroup memory usage in bytes.

    Reads `memory.current` from the fixed `limited_process` cgroup. The
    `pid` argument is unused (kept for signature compatibility) — this
    assumes the monitored process lives in that cgroup; TODO confirm.
    """
    # Plain string (the original used an f-string with no placeholders).
    with open("/sys/fs/cgroup/limited_process/memory.current", "r") as f:
        memory_current = f.read().strip()
    return int(memory_current)  # Return value in bytes

def get_memory_high(pid):
    """Return the cgroup "high" event counter from `memory.events`.

    Lines in memory.events look like "high 42". The `pid` argument is
    unused (kept for signature compatibility). Raises ValueError if no
    "high" entry is present (the original raised UnboundLocalError in
    that case, and matched any line merely containing the substring
    "high").
    """
    with open("/sys/fs/cgroup/limited_process/memory.events", "r") as f:
        for line in f:
            fields = line.split()
            # Match the exact "high <count>" record, not a substring.
            if len(fields) == 2 and fields[0] == "high":
                return int(fields[1])  # Return value (event count)
    raise ValueError("no 'high' entry found in memory.events")

def generate_file_name(M, efC, num_vectors, num_queries, madvise, block_size, process_limit_high):
    """Build the canonical results-file prefix encoding all run parameters."""
    parts = (
        "results",
        f"M_{M}",
        f"efC_{efC}",
        f"vec_{num_vectors}",
        f"q_{num_queries}",
        f"madvise_{madvise}",
        f"bs_{block_size}",
        f"mem_limit_{process_limit_high}",
    )
    return "_".join(parts)

if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(description="Monitor RSS memory usage of a process.")
    arg_parser.add_argument("pid", type=int, help="PID of the process to monitor")
    cli_args = arg_parser.parse_args()

    # The PID comes from the command line; the remaining run parameters
    # are fixed for this experiment and only affect the output file name.
    pid = cli_args.pid
    run_name = generate_file_name(M=60, efC=75, num_vectors=10_000_000,
                                  num_queries=10_000, madvise="None",
                                  block_size=10_240, process_limit_high="55G")
    output_file = f"{run_name}_pid_{pid}_rss_memory_monitor.csv"
    print("Start collecting memory usage for process", pid)
    collect_process_memory(pid, output_file)
137 changes: 137 additions & 0 deletions parse_index_log.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import re
import numpy as np
import matplotlib.pyplot as plt
import csv

def format_index_size(size):
    """Format a vector count compactly: thousands as "<n>K", millions as "<n.n>M".

    Sizes below 1,000 are returned verbatim — the original integer
    division produced a misleading "0K" for them. The plain-number form
    is still accepted by the downstream K/M parser (its `else` branch).
    """
    if size < 1_000:
        return str(size)
    if size < 1_000_000:
        return f"{size // 1_000}K"
    return f"{size / 1_000_000:.1f}M"

def parse_index_log(file_path):
    """Parse a build log into (index_size, batch_time_hr, memory_gb) rows.

    Expects pairs of consecutive lines of the form:
        "... Building <n> vectors time: T<secs> seconds"
        "... Current index memory usage: ... <x> GB"
    For each pair, records the formatted index size, the time spent on the
    batch (hours; delta from the previous sample, 0 for the first), and
    the reported memory usage ("%.2f" string). Prints the rows as a table,
    writes them to `index_data.csv` in the current directory, and returns
    the list of rows.
    """
    with open(file_path, "r") as f:
        lines = f.readlines()

    index_data = []

    time_pattern = re.compile(r'Building (\d+) vectors time: T([\d.]+) seconds')
    memory_pattern = re.compile(r'Current index memory usage: .* ([\d.]+) GB')

    prev_time = None

    for i, line in enumerate(lines):
        time_match = time_pattern.search(line)
        if not time_match:
            continue
        # Guard against a trailing time line with no memory line after it:
        # the original indexed lines[i+1] unconditionally -> IndexError.
        if i + 1 >= len(lines):
            break
        memory_match = memory_pattern.search(lines[i + 1])  # Memory info is on the next line
        if not memory_match:
            continue

        index_size = int(time_match.group(1))
        time_elapsed = float(time_match.group(2))
        memory_usage = float(memory_match.group(1))

        # Compute time difference in hours (0 for the very first sample).
        batch_time_hr = 0 if prev_time is None else (time_elapsed - prev_time) / 3600
        prev_time = time_elapsed

        index_data.append((format_index_size(index_size),
                           round(batch_time_hr, 2),
                           f"{memory_usage:.2f}"))

    # Print the result
    print(f"{'Index Size':<12} {'Batch Time (hr)':<15} {'Memory Usage (GB)':<18}")
    print("=" * 50)
    for row in index_data:
        print(f"{row[0]:<12} {row[1]:<15} {row[2]:<18}")

    # Export data to CSV
    csv_filename = 'index_data.csv'
    with open(csv_filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Index Size", "Batch Time (hr)", "Memory Usage (GB)"])  # Write header
        writer.writerows(index_data)  # Write the data

    print(f"Data saved to {csv_filename}")

    return index_data

def _plot_index_growth(index_data, output_png="index_growth.png"):
    """Render batch time and memory usage vs. index size; save to `output_png`.

    `index_data` rows are (formatted_size, batch_time_hr, memory_usage_str)
    as produced by parse_index_log().
    """
    index_sizes = [row[0] for row in index_data]    # Formatted as K/M strings.
    batch_times = [row[1] for row in index_data]    # Batch elapsed times (hr).
    memory_usages = [round(float(row[2]), 2) for row in index_data]  # GB.

    # Convert formatted index sizes back to numbers, scaled to millions.
    index_sizes_numeric = []
    for size in index_sizes:
        if size.endswith("K"):
            index_sizes_numeric.append(float(size[:-1]) * 1e3 / 1e6)  # K -> millions
        elif size.endswith("M"):
            index_sizes_numeric.append(float(size[:-1]))  # Already in millions
        else:
            index_sizes_numeric.append(float(size) / 1e6)  # Raw count -> millions

    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Left y-axis: batch build time.
    ax1.set_xlabel("Index Size (vectors / 1M)")
    ax1.set_ylabel("Batch Time (hr)", color="tab:blue")
    ax1.plot(index_sizes_numeric, batch_times, marker='o', linestyle='-',
             color="tab:blue", label="Batch Time (hr)")
    ax1.tick_params(axis='y', labelcolor="tab:blue")
    ax1.set_ylim(0, 100)                    # Adjust based on your data range.
    ax1.set_yticks(np.arange(0, 101, 10))   # Ticks every 10 units.

    # Right y-axis: memory usage.
    ax2 = ax1.twinx()
    ax2.set_ylabel("Memory Usage (GB)", color="tab:green")
    ax2.plot(index_sizes_numeric, memory_usages, marker='s', linestyle='--',
             color="tab:green", label="Memory Usage (GB)")
    ax2.tick_params(axis='y', labelcolor="tab:green")
    ax2.set_ylim(0, 3)                      # Adjust based on your data range.
    ax2.set_yticks(np.arange(0, 3.1, 0.5))  # Ticks every 0.5 units.

    plt.title("Build Index")
    ax1.grid(True)

    # Mark where the process memory limit changed (x-axis is in millions).
    x_value_5M = 5
    ax1.axvline(x=x_value_5M, color='red', linestyle='--', linewidth=1)

    # Annotate both sides of the line at 90% of the y-axis height.
    y_pos = 0.9
    ax1.text(
        x_value_5M + 1.5,  # Shift the label to the right of the line.
        ax1.get_ylim()[1] * y_pos,
        "Process limit: 5GB",
        fontsize=12, color="red",
        ha="left", va="center",
        bbox=dict(facecolor='white', alpha=0.7, edgecolor='red')
    )
    ax1.text(
        x_value_5M - 1.5,  # Shift the label to the left of the line.
        ax1.get_ylim()[1] * y_pos,
        "Process limit: 2GB",
        fontsize=12, color="red",
        ha="right", va="center",
        bbox=dict(facecolor='white', alpha=0.7, edgecolor='red')
    )

    plt.tight_layout()  # Adjust layout to make sure everything fits
    plt.savefig(output_png, dpi=300, bbox_inches="tight")
    print(f"Graph saved as '{output_png}'")


if __name__ == "__main__":
    # Guarded so importing this module no longer crashes when the
    # hard-coded log file is absent; run the script with your log file.
    index_data = parse_index_log("results_M_60_efC_75_vec_10000000_q_10000_madvise_None_bs_10240_mem_limit_2G_pid_171927_log.txt")
    _plot_index_growth(index_data)
Loading