-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
✨ Transform 1-minute data into any OHLC timeframe (#10)
* ✨ Transform OHLC data * 📖 Add more examples * 🎨 Modify warning log * 🎨 Set default log level to info * 🎨 Improve error handling * 🎨 Validate timeframe correctly * 📦 Add larger test dataset * ✅ 100% test coverage for transform module * 🎨 * 🔖 0.3.0 * 🎨 Modify logs * 🔧 Ignore ruff line-too-long rule * 🎨 * 🎨 Modify error / warning note * ✨ Use chunk-based method when it's faster than rolling aggregation * 🧪 Experimental script for rolling vs chunk runtime comparison
- Loading branch information
Showing
11 changed files
with
1,971 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
"""This script compares the time taken to compute OHLC data using the rolling window vs chunk-aggregation methods. | ||
Results show that chunk-based aggregation may be faster when the number of chunks is less than 18000. | ||
""" | ||
|
||
import random | ||
import timeit | ||
|
||
import pandas as pd | ||
|
||
from ohlc_toolkit import read_ohlc_csv | ||
from ohlc_toolkit.config.log_config import get_logger | ||
from ohlc_toolkit.transform import rolling_ohlc | ||
|
||
# Module-level logger for the benchmark script.
logger = get_logger(__name__)
# Load the sample dataset
# NOTE(review): path is relative to the repo root — run this script from there.
df_1min = read_ohlc_csv("data/btcusd_bitstamp_1min_latest.csv", timeframe="1m")

# Define the parameters for the test
timeframe = "1w"
# 1 week expressed in minutes (7 * 24 * 60).
timeframe_minutes = 10080
# Step sizes (in minutes) to benchmark the chunk-based method against.
step_sizes = [3, 4, 5, 6, 10, 15]
|
||
|
||
def chunk_based_aggregation(df, timeframe_minutes, step_size_minutes):
    """Perform chunk-based aggregation.

    Slides a window of ``timeframe_minutes`` rows over ``df`` in strides of
    ``step_size_minutes`` rows and aggregates each full window into a single
    OHLC row. Windows that would run past the end of the data are skipped.
    """
    logger.debug(
        "Chunk-based aggregation: {}, {}. {} chunks".format(
            timeframe_minutes, step_size_minutes, len(df) / step_size_minutes
        )
    )
    total_rows = len(df)
    rows = []
    start = 0
    # Advance in step-size strides; stop once a full window no longer fits.
    while start + timeframe_minutes <= total_rows:
        window = df.iloc[start : start + timeframe_minutes]
        rows.append(
            {
                "timestamp": window.index[-1],
                "open": window["open"].iloc[0],
                "high": window["high"].max(),
                "low": window["low"].min(),
                "close": window["close"].iloc[-1],
                "volume": window["volume"].sum(),
            }
        )
        start += step_size_minutes
    return pd.DataFrame(rows)
|
||
|
||
def test_rolling_aggregation(step_size):
    """Benchmark helper: run rolling aggregation once and discard the result.

    The post-hoc ``iloc`` slice mirrors how a caller would thin the rolling
    output down to the requested step size.
    """
    rolling_ohlc(df_1min, timeframe_minutes).iloc[::step_size]
|
||
|
||
def test_chunk_aggregation(step_size):
    """Benchmark helper: run chunk-based aggregation once, discarding the result."""
    _ = chunk_based_aggregation(df_1min, timeframe_minutes, step_size)
|
||
|
||
# Run the performance tests
# Rolling aggregation cost is dominated by the full-window pass (the step-size
# slice happens after the fact), so it is timed once — with a randomly chosen
# step size — and that single figure is reused for every comparison below.
rolling_time = (
    timeit.timeit(lambda: test_rolling_aggregation(random.choice(step_sizes)), number=5)
    / 5
)
for step_size in step_sizes:
    # The lambda captures the loop variable, but timeit invokes it within this
    # same iteration, so late binding is not an issue here.
    chunk_time = timeit.timeit(lambda: test_chunk_aggregation(step_size), number=3) / 3

    print(f"Step size: {step_size} minutes")
    print(f"Rolling aggregation time: {rolling_time:.4f} seconds")
    print(f"Chunk-based aggregation time: {chunk_time:.4f} seconds")
    print("-" * 40)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
[tool.poetry] | ||
name = "ohlc-toolkit" | ||
version = "0.2.0" | ||
description = "A flexible toolkit for working with OHLC data and generating custom time frames from minute data." | ||
version = "0.3.0" | ||
description = "A flexible toolkit for working with OHLC data and generating custom timeframes from minute data." | ||
authors = ["Mourits de Beer <[email protected]>"] | ||
license = "MIT" | ||
readme = "README.md" | ||
|
@@ -51,6 +51,7 @@ lint.ignore = [ | |
"D408", | ||
"D409", | ||
"D413", | ||
"E501", | ||
] | ||
include = ["src/*.py"] | ||
line-length = 88 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,9 @@ | ||
"""OHLC Toolkit.""" | ||
|
||
from ohlc_toolkit.csv_reader import read_ohlc_csv | ||
from ohlc_toolkit.transform import transform_ohlc | ||
|
||
# Public API re-exported at the package root.
__all__ = [
    "read_ohlc_csv",
    "transform_ohlc",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,230 @@ | ||
"""Transform OHLC data.""" | ||
|
||
from logging import Logger | ||
from typing import Union | ||
|
||
import pandas as pd | ||
|
||
from ohlc_toolkit.config.log_config import get_logger | ||
from ohlc_toolkit.timeframes import parse_timeframe, validate_timeframe | ||
from ohlc_toolkit.utils import check_data_integrity | ||
|
||
LOGGER = get_logger(__name__) | ||
|
||
|
||
def _first(row: pd.Series) -> float: | ||
"""Get the first value of a row, for rolling_ohlc aggregation.""" | ||
return row.iloc[0] | ||
|
||
|
||
def _last(row: pd.Series) -> float: | ||
"""Get the last value of a row, for rolling_ohlc aggregation.""" | ||
return row.iloc[-1] | ||
|
||
|
||
def rolling_ohlc(df_input: pd.DataFrame, timeframe_minutes: int) -> pd.DataFrame:
    """Rolling OHLC aggregation.

    Args:
        df_input (pd.DataFrame): The input DataFrame with OHLC data.
        timeframe_minutes (int): The timeframe in minutes for the rolling window.

    Returns:
        pd.DataFrame: The aggregated OHLC data, with same schema as the input DataFrame.
    """
    LOGGER.info(
        "Computing OHLC for a rolling window of {} minutes over {} rows. "
        "The ratio of rows to timeframe is {:.2f}.",
        timeframe_minutes,
        len(df_input),
        # Bug fix: use true division — the "{:.2f}" placeholder expects a
        # fractional ratio, but floor division truncated it to an integer.
        len(df_input) / timeframe_minutes,
    )
    # Each output row aggregates the preceding `timeframe_minutes` input rows;
    # the first `timeframe_minutes - 1` rows are NaN (incomplete windows).
    return df_input.rolling(timeframe_minutes).agg(
        {
            "timestamp": _last,
            "open": _first,
            "high": "max",
            "low": "min",
            "close": _last,
            "volume": "sum",
        }
    )
|
||
|
||
def _cast_to_original_dtypes(
    original_df: pd.DataFrame, transformed_df: pd.DataFrame
) -> pd.DataFrame:
    """Cast the transformed DataFrame to the original DataFrame's data types.

    Args:
        original_df (pd.DataFrame): The original DataFrame with the desired data types.
        transformed_df (pd.DataFrame): The transformed DataFrame to be cast.

    Returns:
        pd.DataFrame: The transformed DataFrame with data types matching the original.
    """
    LOGGER.debug("Casting transformed DataFrame to original dtypes")
    # Only columns present in both frames are cast; extra columns are untouched.
    shared_columns = (c for c in transformed_df.columns if c in original_df.columns)
    for col in shared_columns:
        transformed_df[col] = transformed_df[col].astype(original_df[col].dtype)
    return transformed_df
|
||
|
||
def _drop_expected_nans(df: pd.DataFrame, logger: Logger) -> pd.DataFrame: | ||
"""Drop the expected NaNs from the DataFrame. | ||
We expect the first `timeframe_minutes - 1` rows to be NaNs from the aggregation. | ||
However, we don't want to drop all NaNs in case there are unexpected ones. | ||
Therefore, we drop the expected NaNs and proceed with data integrity checks. | ||
Args: | ||
df (pd.DataFrame): The DataFrame to drop NaNs from. | ||
logger (Logger): The logger to use. | ||
Returns: | ||
pd.DataFrame: The DataFrame with expected NaNs dropped. | ||
""" | ||
logger.debug("Dropping expected NaN values from the aggregated DataFrame") | ||
n = df.first_valid_index() # Get the index of the first valid row | ||
if n is None: | ||
logger.error("No valid rows after aggregation.") | ||
raise ValueError("No valid rows after aggregation.") | ||
|
||
n_pos = df.index.get_loc(n) | ||
|
||
result = pd.concat([df.iloc[:n_pos].dropna(), df.iloc[n_pos:]]) | ||
return result | ||
|
||
|
||
def transform_ohlc(
    df_input: pd.DataFrame, timeframe: Union[int, str], step_size_minutes: int = 1
) -> pd.DataFrame:
    """Transform OHLC data to a different timeframe resolution.

    Args:
        df_input (pd.DataFrame): Input DataFrame with OHLC data.
        timeframe (Union[int, str]): Desired timeframe resolution, which can be
            an integer (in minutes) or a string (e.g., '1h', '4h30m').
        step_size_minutes (int): Step size in minutes for the rolling window.

    Returns:
        pd.DataFrame: Transformed OHLC data.

    Raises:
        NotImplementedError: If a second-level (non whole-minute) timeframe is given.
        ValueError: If the timeframe is invalid or the dataset is too small for it.
    """
    df = df_input.copy()
    bound_logger = LOGGER.bind(
        body={"timeframe": timeframe, "step_size": step_size_minutes}
    )
    bound_logger.debug("Starting transformation of OHLC data")

    # Convert string timeframe to minutes if necessary
    if isinstance(timeframe, str):
        timeframe_seconds = parse_timeframe(timeframe)
        bound_logger.debug("Parsed timeframe string to seconds: {}", timeframe_seconds)
        if timeframe_seconds % 60 != 0:
            bound_logger.error("Second-level timeframes are not yet supported.")
            raise NotImplementedError("Second-level timeframes are not yet supported.")
        timeframe_minutes = timeframe_seconds // 60
    elif isinstance(timeframe, int):
        timeframe_minutes = timeframe
    else:
        bound_logger.error("Invalid timeframe provided: {}", timeframe)
        raise ValueError(f"Invalid timeframe: {timeframe}")

    time_step_seconds = step_size_minutes * 60
    validate_timeframe(
        time_step=time_step_seconds,
        user_timeframe=timeframe_minutes * 60,
        logger=bound_logger,
    )

    bound_logger.debug(
        "Using timeframe of {} minutes for rolling aggregation", timeframe_minutes
    )

    # The following cut-off was determined to be where chunk-based aggregation is faster
    # than rolling aggregation. See scripts/experiment/chunk_vs_rolling_aggregation.py
    chunk_cut_off = 18000
    num_rows = len(df)
    num_chunks = num_rows // step_size_minutes
    if step_size_minutes == 1 or num_chunks > chunk_cut_off:
        # Use rolling aggregation for small step sizes or large datasets
        bound_logger.debug(
            "Using rolling aggregation for step size: {}. "
            "The number of rows would yield {} chunks",
            step_size_minutes,
            num_chunks,
        )
        df_agg = rolling_ohlc(df, timeframe_minutes)
        df_agg = df_agg.iloc[::step_size_minutes]
    else:
        # Use chunk-based aggregation when data step is large relative to num rows
        bound_logger.info(
            "Using chunk-based aggregation for step size: {}. "
            "The {} rows yield {} chunks",
            step_size_minutes,
            num_rows,
            num_chunks,
        )
        aggregated_data = []
        for start in range(0, num_rows, step_size_minutes):
            end = start + timeframe_minutes
            if end > num_rows:
                # If not even one full window fits, the dataset is too small.
                if not aggregated_data:
                    bound_logger.error(
                        "Selected timeframe is too large. {} rows are not enough for "
                        "this timeframe: {} ({} minutes).",  # TODO: Assuming 1-minute.
                        num_rows,
                        timeframe,
                        timeframe_minutes,
                    )
                    raise ValueError(
                        "Timeframe too large. Please ensure your dataset is big enough "
                        f"for this timeframe: {timeframe} ({timeframe_minutes} minutes)."
                    )
                break

            window_df = df.iloc[start:end]
            aggregated_row = {
                "timestamp": window_df["timestamp"].iloc[-1],
                "open": window_df["open"].iloc[0],
                "high": window_df["high"].max(),
                "low": window_df["low"].min(),
                "close": window_df["close"].iloc[-1],
                "volume": window_df["volume"].sum(),
            }
            aggregated_data.append(aggregated_row)

        df_agg = pd.DataFrame(aggregated_data)
        df_agg = df_agg.sort_values("timestamp")

    # Drop the expected NaNs
    try:
        df_agg = _drop_expected_nans(df_agg, bound_logger)
    except ValueError as e:
        raise ValueError(
            f"{str(e)} Please ensure your dataset is big enough "
            f"for this timeframe: {timeframe} ({timeframe_minutes} minutes)."
        ) from e

    # Cast the transformed DataFrame to the original DataFrame's data types
    df_agg = _cast_to_original_dtypes(df_input, df_agg)

    # Do a check to ensure index of dataframe is a datetime index.
    # Bug fix: the original code applied this conversion to the local copy
    # ``df``, which is never used afterwards (the function returns ``df_agg``),
    # so the conversion was silently discarded. Operate on ``df_agg`` instead
    # so the returned frame actually carries the datetime index.
    if not pd.api.types.is_datetime64_any_dtype(df_input.index):
        bound_logger.debug(
            "DataFrame index is not a datetime index, sorting by timestamp"
        )
        df_agg = df_agg.sort_values("timestamp")  # Ensure timestamp is sorted

        # Convert the timestamp column to a datetime index
        df_agg.index = pd.to_datetime(df_agg["timestamp"], unit="s")
        df_agg.index.name = "datetime"
        bound_logger.debug("Converted timestamp column to datetime index")

    check_data_integrity(
        df_agg, logger=bound_logger, time_step_seconds=time_step_seconds
    )

    return df_agg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.