diff --git a/prototype/experiment.ipynb b/prototype/experiment.ipynb new file mode 100644 index 0000000..6fc7b0f --- /dev/null +++ b/prototype/experiment.ipynb @@ -0,0 +1,259 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from dataclasses import dataclass\n", + "from typing import Union, Optional\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass\n", + "class TimeConfig:\n", + " \"\"\"Container for storing and accessing time configuration.\"\"\"\n", + " start_year: str\n", + " end_year: str\n", + " target_start: str = '11-01'\n", + " target_end: str = '12-01'\n", + " freq: Optional[str] = '2M' # daily/monthly\n", + " # tfreq: Union[int, str] = 7\n", + " \n", + " def get_dates_load(self):\n", + " \"\"\"Return index which lines up with the bins\n", + " Leap days are removed.\n", + "\n", + " (see the function \"timeseries_tofit_bins\" in line 628 of functions_pp)\n", + "\n", + " return pd.DateTimeIndex\n", + " \"\"\"\n", + "\n", + " return\n", + "\n", + " def get_resample_bins(self):\n", + " \"\"\"Return bins to aggregate to n day/ n month means\n", + " This function could work with the precursor and target variables.\n", + " \"\"\"\n", + "\n", + " return\n", + "\n", + " def split_train_test_groups(self):\n", + " \"\"\"\n", + " \"\"\"\n", + " return" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tc = TimeConfig(start_year, end_year, target_start, target_end, freq)\n", + "tc.dates2bin # time passed to data loader\n", + "tc.labelbins # same axis as dates2bin, but has labels to group by n day/ n month means\n", + "tc.traintestgroups # smart splitting of train/test data and avoid overlapping\n", + "tc.df_split # aggregated time axis pandas dataframe with columns train=true and RV_mask" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "@dataclass\n", + "class TrainTestSet:\n", + " \"\"\"Base class for various traintest methods.\"\"\"\n", + " df: pd.DataFrame\n", + " \n", + " def get_test_labels(self):\n", + " return self.df.query('traintest==1')\n", + " \n", + " def get_training_labels(self):\n", + " return self.df.query('traintest==2')\n", + "\n", + "class LeaveOutN(TrainTestSet):\n", + " \"\"\"TrainTestSet based on a leave-n-out sampling method.\"\"\"\n", + " def __init__(self, timeconfig, n, max_lag=None):\n", + " times = timeconfig.datetimes\n", + " df = pd.DataFrame(index=times, columns=[\"traintest\"])\n", + " df['traintest'] = np.random.randint(0, 3, len(df))\n", + " self.df = df\n", + "\n", + "class Random(TrainTestSet):\n", + " \"\"\"TrainTestSet based on a random sampling method.\"\"\"\n", + " n: int\n", + "\n", + "class Ranstrat(TrainTestSet):\n", + " \"\"\"TrainTestSet based on a random stratified sampling method.\"\"\"\n", + " n: int\n", + "\n", + "class split(TrainTestSet):\n", + " \"\"\"TrainTestSet based on a simple split method.\"\"\"\n", + " n: int\n", + "\n", + "class TimeSeriesSplit(TrainTestSet):\n", + " \"\"\"TrainTestSet based on a \"one-step-ahead\" method.\"\"\"\n", + " n: int\n", + "\n", + "class RepeatedKfold(TrainTestSet):\n", + " \"\"\"TrainTestSet based on a repeated k-fold with different randomizations.\"\"\"\n", + " n_repeats: int\n", + " n_folds: int" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
traintest
1979-01-032
1979-01-092
1979-01-162
1979-01-182
1979-01-192
......
2020-12-082
2020-12-142
2020-12-242
2020-12-292
2020-12-302
\n", + "

5082 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " traintest\n", + "1979-01-03 2\n", + "1979-01-09 2\n", + "1979-01-16 2\n", + "1979-01-18 2\n", + "1979-01-19 2\n", + "... ...\n", + "2020-12-08 2\n", + "2020-12-14 2\n", + "2020-12-24 2\n", + "2020-12-29 2\n", + "2020-12-30 2\n", + "\n", + "[5082 rows x 1 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tc = TimeConfig(start_year = '1979', end_year = '2021', freq='D')\n", + "ttset = LeaveOutN(tc, n=5)\n", + "ttset.get_training_labels()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class CrossValidator():\n", + " \"\"\"Perform cross-validation of training/testing data\"\"\"\n", + " def __init__(self):" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "08d8a5f855c7a1e3d1e204794a733da246b168cd162c0e6afd9e496498bf62ad" + }, + "kernelspec": { + "display_name": "Python 3.8.12 ('eucp')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/prototype/prototype.ipynb b/prototype/prototype.ipynb new file mode 100644 index 0000000..dac06c3 --- /dev/null +++ b/prototype/prototype.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import S2Sinit\n", + "import climpp\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initialize the workflow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# define parameters for time config\n", + "start_year= 1979\n", + "end_year = 2020\n", + "target_start = '11-01'\n", + "target_end = '12-01'\n", + "freq = '2M' # two month means, with pandas\n", + "# define parameters for cross-validator\n", + "max_lag = 5\n", + "k = 10\n", + "\n", + "# this function only allows single target period\n", + "# data on the same coordinate\n", + "initializer = S2Sinit(start_year, end_year, target_start,\n", + " target_end, freq)\n", + " \n", + "# ------------------ optional procedure\n", + "# call cross validator\n", + "initializer.cv.kfold(max_lag, k) # only specify parameters for cross-validation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Preprocessing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# define parameters for preprocessing\n", + "path2target = 'path/target_var'\n", + "path2sst = 'path/sst'\n", + "path2z500 = 'path/z500'\n", + "# preprocess recipe for target variable\n", + "preprocess_steps_target = ['detrend', 'deseasonalize', 'standardize']\n", + "# preprocess recipe for precursors\n", + "preprocess_steps_sst = ['detrend', 'deseasonalize', 'standardize']\n", + "preprocess_steps_z500 = ['detrend', 'deseasonalize']\n", + "\n", + "# preprocess target\n", + "target_instance = climpp(initializer, path2target, preprocess_steps_target)\n", + "target_instance = climpp(initializer, path2target, preprocess_steps_target)\n", + "# preprocess precursor sst\n", + "sst_instance = climpp(initializer, path2sst, preprocess_steps_sst)\n", + "z500_instance = climpp(initializer, path2z500, preprocess_steps_z500)\n", + "\n", + "# execute preprocessing steps\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}