60 commits
29249f5
first bit of work
peterdudfield Nov 14, 2025
cc50553
add to readme
peterdudfield Nov 16, 2025
9803a66
add to todo list
peterdudfield Nov 16, 2025
4779ec0
add todo
peterdudfield Nov 16, 2025
b159482
stream forecast more efficiently
peterdudfield Nov 17, 2025
cbb10ea
tidy up observations
peterdudfield Nov 17, 2025
46c7792
tidy up time window
peterdudfield Nov 17, 2025
fb83cc4
dp 0.13.1
peterdudfield Nov 17, 2025
5ee30e2
Use Watts not %
peterdudfield Nov 17, 2025
ce8497b
load 30 days of data
peterdudfield Nov 17, 2025
834581c
add metrics table
peterdudfield Nov 17, 2025
65bf1e2
add data caching
peterdudfield Nov 17, 2025
963833d
move back to 7 days
peterdudfield Nov 17, 2025
72ca134
add caching
peterdudfield Nov 18, 2025
bda0da3
move data to new file
peterdudfield Nov 18, 2025
75ab761
add colours to main plot
peterdudfield Nov 18, 2025
dc1b91f
update import
peterdudfield Nov 18, 2025
a111be7
scale by units and add colours
peterdudfield Nov 18, 2025
5a04423
add probabilistic
peterdudfield Nov 18, 2025
d538064
add forecast type options, add daily MAE options
peterdudfield Nov 18, 2025
7791489
add daily ME
peterdudfield Nov 18, 2025
095d495
add two todos
peterdudfield Nov 19, 2025
d10135f
solve for different forecast versions
peterdudfield Nov 19, 2025
2a57b66
remove from todo
peterdudfield Nov 19, 2025
85dd00a
add legendgroup
peterdudfield Nov 20, 2025
6d0b280
filter on pvlive_day_after
peterdudfield Nov 20, 2025
fa6af55
add todo bug not releasing cache
peterdudfield Nov 20, 2025
d3bdaf6
refactor into multiple files
peterdudfield Nov 20, 2025
2d6ad59
increase forecast window to 30 days
peterdudfield Nov 21, 2025
ccf2c87
add init files
peterdudfield Nov 21, 2025
144ccf4
fix import
peterdudfield Nov 21, 2025
10bbf6e
add more todos
peterdudfield Nov 24, 2025
a0faf6b
add TODOs
peterdudfield Nov 24, 2025
1886cb5
use MW by default on UK-National
peterdudfield Nov 24, 2025
5cf060c
add gsp id to name
peterdudfield Nov 24, 2025
c427a77
reduce to 7 days
peterdudfield Nov 24, 2025
b0dd9ae
fix for MAE plot
peterdudfield Nov 24, 2025
e5b137a
have option to show sem
peterdudfield Nov 24, 2025
c83446e
forecast vs actual
peterdudfield Nov 24, 2025
1c0a0b3
remove duplicate in daily MAE plot
peterdudfield Nov 24, 2025
c47f8b1
minus 1 sec, so we don't get observations on the next day
peterdudfield Nov 24, 2025
6ceaad3
tidy
peterdudfield Nov 24, 2025
91f60aa
option for aligning t0s
peterdudfield Nov 24, 2025
4b30bdb
MAE plot link to 0
peterdudfield Nov 24, 2025
9603b9c
try to sort cache issue out
peterdudfield Nov 25, 2025
b861b7d
add select t0s from forecast
peterdudfield Nov 25, 2025
839cd21
tidy
peterdudfield Nov 25, 2025
86c6f8c
add todo
peterdudfield Nov 26, 2025
44b08ba
cache more functions
peterdudfield Nov 26, 2025
ab92c25
ruff
peterdudfield Nov 26, 2025
0f4058e
Feedback, add details
peterdudfield Nov 26, 2025
3af7531
robustness against no forecast data
peterdudfield Nov 27, 2025
4a0ff33
release cache data every 5 mins
peterdudfield Nov 28, 2025
6978935
PR comments
peterdudfield Dec 1, 2025
2b2da5b
add option for strict forecast filtering
peterdudfield Dec 1, 2025
b1bee0b
tidy
peterdudfield Dec 1, 2025
50e0ae2
PR comments, use agg better
peterdudfield Dec 1, 2025
5232456
use p10_fraction, rather than p10
peterdudfield Dec 1, 2025
612ebbf
add _fraction to column from other_statistics_fractions column
peterdudfield Dec 1, 2025
0726b10
lint
peterdudfield Dec 1, 2025
2 changes: 2 additions & 0 deletions README.md
@@ -83,6 +83,8 @@ To run the app locally, you'll need to connect it to the `forecast development database`

OCF team members can connect to the `forecast development database` using [these Notion instructions](https://www.notion.so/openclimatefix/Connecting-to-AWS-RDS-bf35b3fbd61f40df9c974c240e042354). Add `DB_URL= (db_url from notion documents)` to a `secrets.toml` file. Follow the instructions in the Notion document to connect to the database.

+To connect to the Data Platform, set `DATA_PLATFORM_HOST` and `DATA_PLATFORM_PORT`.

Run app:

```shell
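A minimal `secrets.toml` sketch combining the settings above; every value is a placeholder, and whether the `DATA_PLATFORM_*` settings are read from `secrets.toml` or the environment depends on how you run the app:

```toml
# .streamlit/secrets.toml (placeholder values, for illustration only)
DB_URL = "postgresql://user:password@localhost:5432/forecast_dev"
DATA_PLATFORM_HOST = "localhost"
DATA_PLATFORM_PORT = "50051"
```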
7 changes: 6 additions & 1 deletion pyproject.toml
@@ -18,7 +18,7 @@ dependencies = [
"plotly==5.24.1",
"psycopg2-binary==2.9.10",
"SQLAlchemy==2.0.36",
"streamlit==1.46.1",
"streamlit==1.51.0",
"testcontainers==4.9.0",
"uvicorn==0.34.0",
"geopandas==1.0.1",
@@ -35,6 +35,8 @@ dependencies = [
"torch @ https://download.pytorch.org/whl/cpu/torch-2.3.1%2Bcpu-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64'",
"torch @ https://download.pytorch.org/whl/cpu/torch-2.3.1-cp312-none-macosx_11_0_arm64.whl ; platform_system == 'Darwin' and platform_machine == 'arm64'",
"matplotlib>=3.8,<4.0",
"dp-sdk",
"aiocache",
]

[project.optional-dependencies]
@@ -66,6 +68,9 @@ dev-dependencies = [
index-url = "https://download.pytorch.org/whl/cpu"
extra-index-url = ["https://pypi.org/simple"]

+[tool.uv.sources]
+dp-sdk = { url = "https://github.com/openclimatefix/data-platform/releases/download/v0.13.2/dp_sdk-0.13.2-py3-none-any.whl" }

[tool.pytest.ini_options]
testpaths = ["tests"]
python_files = ["test_*.py"]
Empty file added src/dataplatform/__init__.py
Empty file.
Empty file added src/dataplatform/forecast/__init__.py
Empty file.
27 changes: 27 additions & 0 deletions src/dataplatform/forecast/cache.py
@@ -0,0 +1,27 @@
"""Cache utilities for the forecast module."""

from collections.abc import Callable
from datetime import UTC, datetime, timedelta

from dp_sdk.ocf import dp

from dataplatform.forecast.constant import cache_seconds


def key_builder_remove_client(func: Callable, *args: object, **kwargs: object) -> str:
"""Custom key builder that ignores the client argument for caching purposes."""
key = f"{func.__name__}:"
for arg in args:
if not isinstance(arg, dp.DataPlatformDataServiceStub):
key += f"{arg}-"

for k, v in kwargs.items():
key += f"{k}={v}-"

    # round the current time down to the nearest 5 minutes; this forces a new cache key every 5 minutes
current_time = datetime.now(UTC).replace(second=0, microsecond=0)
current_time = current_time - timedelta(
minutes=current_time.minute % (int(cache_seconds / 60)),
)
key += f"time={current_time}-"

return key
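For context, a usage sketch mirroring how `data.py` below applies this key builder; `get_something` and its arguments are hypothetical names:

```python
# Sketch: key_builder_remove_client skips the DataPlatformDataServiceStub
# argument, so cache hits do not depend on the client object, and the
# 5-minute time bucket baked into the key rolls entries over naturally.
from aiocache import Cache, cached
from dp_sdk.ocf import dp

from dataplatform.forecast.cache import key_builder_remove_client
from dataplatform.forecast.constant import cache_seconds


@cached(ttl=cache_seconds, cache=Cache.MEMORY, key_builder=key_builder_remove_client)
async def get_something(client: dp.DataPlatformDataServiceStub, location_uuid: str) -> dict:
    """Hypothetical cached fetch; calls within one window share a cache entry."""
    ...
```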
24 changes: 24 additions & 0 deletions src/dataplatform/forecast/constant.py
@@ -0,0 +1,24 @@
"""Constants for the forecast module."""

colours = [
"#FFD480",
"#FF8F73",
"#4675C1",
"#65B0C9",
"#58B0A9",
"#FAA056",
"#306BFF",
"#FF4901",
"#B701FF",
"#17E58F",
]

metrics = {
"MAE": "MAE is absolute mean error, average(abs(forecast-actual))",
"ME": "ME is mean (bias) error, average((forecast-actual))",
}

cache_seconds = 300 # 5 minutes

# These observer names cover the specific case of UK National and GSP locations
observer_names = ["pvlive_in_day", "pvlive_day_after"]
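In pandas terms, and matching the `error` and `absolute_error` columns that `data.py` computes below, these definitions reduce to (with `df` a hypothetical merged dataframe):

```python
# Sketch: MAE and ME over forecast p50_watts vs observed value_watts.
error = df["p50_watts"] - df["value_watts"]
mae = error.abs().mean()  # mean absolute error
me = error.mean()  # mean (bias) error
```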
243 changes: 243 additions & 0 deletions src/dataplatform/forecast/data.py
@@ -0,0 +1,243 @@
"""Functions to get forecast and observation data from Data Platform."""

import time
from datetime import datetime, timedelta

import betterproto
import pandas as pd
from aiocache import Cache, cached
from dp_sdk.ocf import dp

from dataplatform.forecast.cache import key_builder_remove_client
from dataplatform.forecast.constant import cache_seconds, observer_names


async def get_forecast_data(
client: dp.DataPlatformDataServiceStub,
location: dp.ListLocationsResponseLocationSummary,
start_date: datetime,
end_date: datetime,
selected_forecasters: list[dp.Forecaster],
) -> pd.DataFrame:
"""Get forecast data for the given location and time window."""
all_data_df = []

for forecaster in selected_forecasters:
forecaster_data_df = await get_forecast_data_one_forecaster(
client,
location,
start_date,
end_date,
forecaster,
)
if forecaster_data_df is not None:
all_data_df.append(forecaster_data_df)

all_data_df = pd.concat(all_data_df, ignore_index=True)

all_data_df["effective_capacity_watts"] = all_data_df["effective_capacity_watts"].astype(float)

    # convert the p50 fraction to watts using the effective capacity
all_data_df["p50_watts"] = all_data_df["p50_fraction"] * all_data_df["effective_capacity_watts"]

for col in ["p10", "p25", "p75", "p90"]:
Review comment (Member): Could you not put the p50 in here and avoid the duplicated lines above?

Review comment (Member): Actually, I see the column name is p50_fraction but the other quantiles don't have the fraction suffix. I'm confused why this should differ between the quantiles, since it looks like the others are also fractions: they are multiplied by the capacity here.

Reply (Contributor, author): Yes, this needs to be updated here and here to fix that. I'll make some GitHub issues for it.

(A sketch of the suggested refactor appears after this function.)
col_fraction = f"{col}_fraction"
if col_fraction in all_data_df.columns:
all_data_df[f"{col}_watts"] = (
all_data_df[col_fraction] * all_data_df["effective_capacity_watts"]
)

return all_data_df
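A minimal sketch of the refactor suggested in the review thread above, assuming every quantile column, p50 included, carried the `_fraction` suffix; this is an illustration, not code from this PR:

```python
# Hypothetical: one loop converts every quantile fraction to watts,
# removing the duplicated p50 lines, assuming p50_fraction exists.
for col in ["p10", "p25", "p50", "p75", "p90"]:
    col_fraction = f"{col}_fraction"
    if col_fraction in all_data_df.columns:
        all_data_df[f"{col}_watts"] = (
            all_data_df[col_fraction] * all_data_df["effective_capacity_watts"]
        )
```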


@cached(ttl=cache_seconds, cache=Cache.MEMORY, key_builder=key_builder_remove_client)
async def get_forecast_data_one_forecaster(
    client: dp.DataPlatformDataServiceStub,
location: dp.ListLocationsResponseLocationSummary,
start_date: datetime,
end_date: datetime,
selected_forecaster: dp.Forecaster,
) -> pd.DataFrame | None:
"""Get forecast data for one forecaster for the given location and time window."""
all_data_list_dict = []

    # Grab all the data in chunks of 30 days, to avoid overly large requests
temp_start_date = start_date
while temp_start_date <= end_date:
temp_end_date = min(temp_start_date + timedelta(days=30), end_date)

# fetch data
stream_forecast_data_request = dp.StreamForecastDataRequest(
location_uuid=location.location_uuid,
energy_source=dp.EnergySource.SOLAR,
time_window=dp.TimeWindow(
start_timestamp_utc=temp_start_date,
end_timestamp_utc=temp_end_date,
),
forecasters=[selected_forecaster],
)
forecasts = []
async for chunk in client.stream_forecast_data(stream_forecast_data_request):
forecasts.append(
chunk.to_dict(include_default_values=True, casing=betterproto.Casing.SNAKE),
)

if len(forecasts) > 0:
all_data_list_dict.extend(forecasts)

temp_start_date = temp_start_date + timedelta(days=30)

all_data_df = pd.DataFrame.from_dict(all_data_list_dict)
if len(all_data_df) == 0:
return None

    # expand the p-levels into columns and rename them with a '_fraction' suffix
columns_before_expand = set(all_data_df.columns)
all_data_df = all_data_df.pipe(
lambda df: df.join(pd.json_normalize(df["other_statistics_fractions"])),
).drop("other_statistics_fractions", axis=1)
new_columns = set(all_data_df.columns) - columns_before_expand
if len(new_columns) > 0:
all_data_df = all_data_df.rename(columns={col: f"{col}_fraction" for col in new_columns})

    # create forecaster_name: forecaster_fullname with the version suffix removed
all_data_df["forecaster_name"] = all_data_df["forecaster_fullname"].apply(
lambda x: x.rsplit(":", 1)[0], # split from right, max 1 split
)

return all_data_df
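To make the expand-and-rename step above concrete, a tiny self-contained illustration with made-up values:

```python
# Illustration (hypothetical values): dicts in other_statistics_fractions
# become p10_fraction / p90_fraction columns after the join and rename.
import pandas as pd

df = pd.DataFrame({"other_statistics_fractions": [{"p10": 0.12, "p90": 0.61}]})
df = df.join(pd.json_normalize(df["other_statistics_fractions"])).drop(
    "other_statistics_fractions", axis=1
)
df = df.rename(columns={c: f"{c}_fraction" for c in ("p10", "p90")})
# df.columns is now: ['p10_fraction', 'p90_fraction']
```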


@cached(ttl=cache_seconds, cache=Cache.MEMORY, key_builder=key_builder_remove_client)
async def get_all_observations(
client: dp.DataPlatformDataServiceStub,
location: dp.ListLocationsResponseLocationSummary,
start_date: datetime,
end_date: datetime,
) -> pd.DataFrame:
"""Get all observations for the given location and time window."""
all_observations_df = []

for observer_name in observer_names:
# Get all the observations for this observer_name, in chunks of 7 days
observation_one_df = []
temp_start_date = start_date
while temp_start_date <= end_date:
temp_end_date = min(temp_start_date + timedelta(days=7), end_date)

get_observations_request = dp.GetObservationsAsTimeseriesRequest(
observer_name=observer_name,
location_uuid=location.location_uuid,
energy_source=dp.EnergySource.SOLAR,
time_window=dp.TimeWindow(temp_start_date, temp_end_date),
)
get_observations_response = await client.get_observations_as_timeseries(
get_observations_request,
)

observations = []
for chunk in get_observations_response.values:
observations.append(
chunk.to_dict(include_default_values=True, casing=betterproto.Casing.SNAKE),
)

observation_one_df.append(pd.DataFrame.from_dict(observations))

temp_start_date = temp_start_date + timedelta(days=7)

observation_one_df = pd.concat(observation_one_df, ignore_index=True)
observation_one_df = observation_one_df.sort_values(by="timestamp_utc")
observation_one_df["observer_name"] = observer_name

all_observations_df.append(observation_one_df)

all_observations_df = pd.concat(all_observations_df, ignore_index=True)

all_observations_df["effective_capacity_watts"] = all_observations_df[
"effective_capacity_watts"
].astype(float)

all_observations_df["value_watts"] = (
all_observations_df["value_fraction"] * all_observations_df["effective_capacity_watts"]
)
all_observations_df["timestamp_utc"] = pd.to_datetime(all_observations_df["timestamp_utc"])

return all_observations_df


async def get_all_data(
client: dp.DataPlatformDataServiceStub,
selected_location: dp.ListLocationsResponseLocationSummary,
start_date: datetime,
end_date: datetime,
selected_forecasters: list[dp.Forecaster],
) -> dict:
"""Get all forecast and observation data, and merge them."""
    # get observation (generation) data
time_start = time.time()
all_observations_df = await get_all_observations(
client,
selected_location,
start_date,
end_date,
)
observation_seconds = time.time() - time_start

    # get all the forecast data
time_start = time.time()
all_forecast_data_df = await get_forecast_data(
client,
selected_location,
start_date,
end_date,
selected_forecasters,
)
forecast_seconds = time.time() - time_start

    # If the observation data includes both pvlive_day_after and pvlive_in_day,
    # keep only pvlive_day_after
one_observations_df = all_observations_df.copy()
if "pvlive_day_after" in all_observations_df["observer_name"].values:
one_observations_df = all_observations_df[
all_observations_df["observer_name"] == "pvlive_day_after"
]

# make target_timestamp_utc
all_forecast_data_df["init_timestamp"] = pd.to_datetime(all_forecast_data_df["init_timestamp"])
all_forecast_data_df["target_timestamp_utc"] = all_forecast_data_df[
"init_timestamp"
] + pd.to_timedelta(all_forecast_data_df["horizon_mins"], unit="m")

    # merge the forecast data with the observations on target timestamp,
    # so error metrics can be computed per horizon_mins and forecaster_fullname
merged_df = pd.merge(
all_forecast_data_df,
one_observations_df,
left_on=["target_timestamp_utc"],
right_on=["timestamp_utc"],
how="inner",
suffixes=("_forecast", "_observation"),
)

# error and absolute error
merged_df["error"] = merged_df["p50_watts"] - merged_df["value_watts"]
merged_df["absolute_error"] = merged_df["error"].abs()

return {
"merged_df": merged_df,
"all_forecast_data_df": all_forecast_data_df,
"all_observations_df": all_observations_df,
"forecast_seconds": forecast_seconds,
"observation_seconds": observation_seconds,
}


def align_t0(merged_df: pd.DataFrame) -> pd.DataFrame:
"""Align t0 forecasts for different forecasters."""
# number of unique forecasters
num_forecasters = merged_df["forecaster_name"].nunique()
# Count number of forecasters that have each t0 time
counts = merged_df.groupby("init_timestamp")["forecaster_name"].nunique()
# Filter to just those t0s that all forecasters have
common_t0s = counts[counts == num_forecasters].index
return merged_df[merged_df["init_timestamp"].isin(common_t0s)]
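As a worked example of how these pieces combine, the daily MAE per forecaster mentioned in the commits above can be computed from `merged_df`; a sketch, not the app's actual plotting code:

```python
# Sketch: daily MAE per forecaster, using only columns created in
# get_all_data (target_timestamp_utc, forecaster_name, absolute_error).
def daily_mae(merged_df: pd.DataFrame) -> pd.DataFrame:
    """Hypothetical helper: mean absolute error per forecaster per day."""
    df = merged_df.copy()
    df["date"] = df["target_timestamp_utc"].dt.date
    return (
        df.groupby(["forecaster_name", "date"])["absolute_error"]
        .mean()
        .reset_index(name="daily_mae_watts")
    )
```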