From c8fcf58841ac55cac7a9e5995a4ffb23ee24a2c5 Mon Sep 17 00:00:00 2001 From: Stephan Rasp Date: Tue, 28 Nov 2023 09:52:27 +0100 Subject: [PATCH 1/2] Updated documentation. Set version to 0.2.0. --- docs/source/api.md | 24 + docs/source/command-line-scripts.md | 162 +- docs/source/data-guide.ipynb | 39015 ++++++++++++++++---------- docs/source/official-evaluation.md | 493 +- setup.py | 2 +- 5 files changed, 24670 insertions(+), 15026 deletions(-) diff --git a/docs/source/api.md b/docs/source/api.md index c1f1fb7..72b818b 100644 --- a/docs/source/api.md +++ b/docs/source/api.md @@ -21,6 +21,8 @@ metrics.Bias metrics.SpatialBias metrics.ACC + metrics.SEEPS + metrics.SpatialSEEPS ``` ### Probabilistic Metrics @@ -30,13 +32,22 @@ metrics.EnsembleMetric metrics.CRPS + metrics.SpatialCRPS metrics.CRPSSpread + metrics.SpatialCRPSSpread metrics.CRPSSkill + metrics.SpatialCRPSSkill metrics.EnsembleStddev + metrics.EnsembleVariance + metrics.SpatialEnsembleVariance metrics.EnsembleMeanRMSE + metrics.EnsembleMeanMSE + metrics.SpatialEnsembleMeanMSE metrics.EnergyScore metrics.EnergyScoreSpread metrics.EnergyScoreSkill + metrics.RankHistogram + metrics.GaussianCRPS ``` ## Config @@ -63,6 +74,7 @@ regions.SliceRegion regions.ExtraTropicalRegion regions.LandRegion + regions.CombinedRegion ``` ## Derived Variables @@ -73,6 +85,18 @@ derived_variables.DerivedVariable derived_variables.WindSpeed + derived_variables.WindDivergence + derived_variables.WindVorticity + derived_variables.VerticalVelocity + derived_variables.EddyKineticEnergy + derived_variables.GeostrophicWindSpeed + derived_variables.AgeostrophicWindSpeed + derived_variables.UComponentOfAgeostrophicWind + derived_variables.VComponentOfAgeostrophicWind + derived_variables.LapseRate + derived_variables.TotalColumnWater + derived_variables.IntegratedWaterTransport + derived_variables.RelativeHumidity derived_variables.PrecipitationAccumulation derived_variables.AggregatePrecipitationAccumulation 
derived_variables.ZonalEnergySpectrum diff --git a/docs/source/command-line-scripts.md b/docs/source/command-line-scripts.md index 1e0ae92..85a4dac 100644 --- a/docs/source/command-line-scripts.md +++ b/docs/source/command-line-scripts.md @@ -19,7 +19,9 @@ usage: evaluate.py [-h] [--probabilistic_climatology_start_year PROBABILISTIC_CLIMATOLOGY_START_YEAR] [--probabilistic_climatology_end_year PROBABILISTIC_CLIMATOLOGY_END_YEAR] [--probabilistic_climatology_hour_interval PROBABILISTIC_CLIMATOLOGY_HOUR_INTERVAL] - [--add_land_region] + [--regions REGIONS] + [--lsm_dataset LSM_DATASET] + [--compute_seeps] [--eval_configs EVAL_CONFIGS] [--ensemble_dim ENSEMBLE_DIM] [--rename_variables RENAME_VARIABLES] @@ -53,7 +55,9 @@ _Command options_: for probabilistic climatology * `--probabilistic_climatology_hour_interval`: Hour interval to compute probabilistic climatology. Default: 6 -* `--add_land_region`: Add land-only evaluation. `land_sea_mask` must be in observation dataset. +* `--regions`: Comma delimited list of predefined regions to evaluate. "all" for all predefined regions. +* `--lsm_dataset`: Dataset containing land-sea-mask at same resolution of datasets to be evaluated. Required if region with land-sea-mask is picked. If None, defaults to observation dataset. +* `--compute_seeps`: Compute SEEPS for total_precipitation. * `--eval_configs`: Comma-separated list of evaluation configs to run. See details below. Default: `deterministic` * `--ensemble_dim`: Ensemble dimension name for ensemble metrics. Default: `number`. * `--rename_variables`: Dictionary of variable to rename to standard names. E.g. {"2t": "2m_temperature"} @@ -70,59 +74,8 @@ _Command options_: * `--beam_runner`: Beam runner * `--fanout`: Beam CombineFn fanout. Might be required for large dataset. 
Default: `None` -*Predefined evaluation configs* +[Predefined evaluation configs](https://github.com/google-research/weatherbench2/blob/main/scripts/evaluate.py#L389) -``` -deterministic_metrics = { - 'rmse': RMSE(wind_vector_rmse=_wind_vector_rmse()), - 'mse': MSE(), - 'acc': ACC(climatology=climatology), -} - -eval_configs = { - 'deterministic': config.Eval( - metrics=deterministic_metrics, - against_analysis=False, - regions=regions, - derived_variables=derived_variables, - evaluate_persistence=EVALUATE_PERSISTENCE.value, - evaluate_climatology=EVALUATE_CLIMATOLOGY.value, - ), - 'deterministic_spatial': config.Eval( - metrics={'bias': SpatialBias(), 'mse': SpatialMSE()}, - against_analysis=False, - derived_variables=derived_variables, - evaluate_persistence=EVALUATE_PERSISTENCE.value, - evaluate_climatology=EVALUATE_CLIMATOLOGY.value, - ), - 'deterministic_temporal': config.Eval( - metrics=deterministic_metrics, - against_analysis=False, - regions=regions, - derived_variables=derived_variables, - evaluate_persistence=EVALUATE_PERSISTENCE.value, - evaluate_climatology=EVALUATE_CLIMATOLOGY.value, - temporal_mean=False, - ), - 'probabilistic': config.Eval( - metrics={ - 'crps': CRPS(ensemble_dim=ENSEMBLE_DIM.value), - 'ensemble_mean_rmse': EnsembleMeanRMSE( - ensemble_dim=ENSEMBLE_DIM.value - ), - 'ensemble_stddev': EnsembleStddev( - ensemble_dim=ENSEMBLE_DIM.value - ), - }, - against_analysis=False, - derived_variables=derived_variables, - evaluate_probabilistic_climatology=EVALUATE_PROBABILISTIC_CLIMATOLOGY.value, - probabilistic_climatology_start_year=PROBABILISTIC_CLIMATOLOGY_START_YEAR.value, - probabilistic_climatology_end_year=PROBABILISTIC_CLIMATOLOGY_END_YEAR.value, - probabilistic_climatology_hour_interval=PROBABILISTIC_CLIMATOLOGY_HOUR_INTERVAL.value, - ), -} -``` *Example* @@ -149,6 +102,7 @@ This scripts computes a day-of-year, hour-of-day climatology with optional smoot usage: compute_climatology.py [-h] [--input_path INPUT_PATH] [--output_path 
OUTPUT_PATH] + [--frequency FREQUENCY] [--hour_interval HOUR_INTERVAL] [--window_size WINDOW_SIZE] [--start_year START_YEAR] @@ -160,7 +114,7 @@ usage: compute_climatology.py [-h] [--add_statistic_suffix] [--method METHOD] [--seeps_dry_threshold_mm SEEPS_DRY_THRESHOLD_MM] - [--beam_runner BEAM_RUNNER] + [--runner RUNNER] ``` @@ -168,6 +122,7 @@ _Command options_: * `--input_path`: (required) Input Zarr path * `--output_path`: (required) Output Zarr path +* `--frequency`: Frequency of the computed climatology. "hourly": Compute the climatology per day of year and hour of day. "daily": Compute the climatology per day of year. * `--hour_interval`: Which intervals to compute hourly climatology for. Default: `1` * `--window_size`: Window size in days to average over. Default: `61` * `--start_year`: Inclusive start year of climatology. Default: `1990` @@ -179,7 +134,7 @@ _Command options_: * `--add_statistic_suffix`: Add suffix of statistic to variable name. Required for >1 statistic. * `--method`: Computation method to use. "explicit": Stack years first, apply rolling and then compute weighted statistic over (year, rolling_window). "fast": Compute statistic over day-of-year first and then apply weighted smoothing. Mathematically equivalent for mean but different for nonlinear statistics. Default: `explicit` * `--seeps_dry_threshold_mm`: Dict defining dry threshold for SEEPS quantile computation for each precipitation variable. In mm. Default: `"{'total_precipitation_24hr':0.25, 'total_precipitation_6hr':0.1}"` -* `--beam_runner`: Beam runner. Use `DirectRunner` for local execution. +* `--runner`: Beam runner. Use `DirectRunner` for local execution. 
*Example* @@ -203,9 +158,11 @@ Computes derived variables, adds them to the original dataset and saves it as a usage: compute_derived_variables.py [-h] [--input_path INPUT_PATH] [--output_path OUTPUT_PATH] - [--derived_variables DERIVED_VARIABLES] + [--derived_variables DERIVED_VARIABLES] + [--preexisting_variables_to_remove PREEXISTING_VARIABLES_TO_REMOVE] [--raw_tp_name RAW_TP_NAME] [--rename_raw_tp_name] + [--rename_variables RENAME_VARIABLES] [--working_chunks WORKING_CHUNKS] [--rechunk_itemsize RECHUNK_ITEMSIZE] [--max_mem_gb MAX_MEM_GB] @@ -216,9 +173,11 @@ _Command options_: * `--input_path`: (required) Input Zarr path * `--output_path`: (required) Output Zarr path -* `--derived_variables`: (required) Comma delimited list of derived variables to compute. Default: `wind_speed,10m_wind_speed,total_precipitation_6hr,total_precipitation_24hr` +* `--derived_variables`: Comma delimited list of derived variables to compute. By default, tries to compute all derived variables. +* `--preexisting_variables_to_remove`: Comma delimited list of variables to remove from the source data, if they exist. This is useful to allow for overriding source dataset variables with derived variables of the same name. * `--raw_tp_name`: Raw name of total precipitation variables. Use "total_precipitation_6hr" for backwards compatibility. * `--rename_raw_tp_name`: Rename raw tp name to "total_precipitation". +* `--rename_variables`: Dictionary of variable to rename to standard names. E.g. {"2t":"2m_temperature"} * `--working_chunks`: Chunk sizes overriding input chunks to use for computing aggregations e.g., "longitude=10,latitude=10". No need to add prediction_timedelta=-1, this is automatically added for aggregation variables. Default: `None`, i.e. input chunks * `--rechunk_itemsize`: Itemsize for rechunking. Default: `4` * `--max_mem_gb`: Max memory for rechunking in GB. 
Default: `1` @@ -250,6 +209,7 @@ usage: compute_zonal_energy_spectrum.py [-h] [--time_stop TIME_STOP] [--levels LEVELS] [--averaging_dims AVERAGING_DIMS] + [--fanout FANOUT] [--runner RUNNER] ``` @@ -263,12 +223,13 @@ _Command options_: * `--time_stop`: ISO 8601 timestamp (inclusive) at which to stop evaluation. Default: `2020-12-31` * `--levels`: Comma delimited list of pressure levels to compute spectra on. If empty, compute on all levels of --input_path. Default: `500,700,850` * `--averaging_dims`: Comma delimited list of variables to average over. If empty, do not average. Default: `time` +* `--fanout`: Beam CombineFn fanout. Might be required for large dataset. * `--runner`: Beam runner. Use `DirectRunner` for local execution. *Example* ```bash -python compute_zonal_power_spectrum.py \ +python compute_zonal_energy_spectrum.py \ --input_path=gs://weatherbench2/datasets/era5/1959-2022-6h-240x121_equiangular_with_poles_conservative.zarr \ --output_path=PATH \ --time_start=2020 \ @@ -284,6 +245,9 @@ To use the ensemble mean in deterministic evaluation, we first must compute the usage: compute_ensemble_mean.py [-h] [--input_path INPUT_PATH] [--output_path OUTPUT_PATH] + [--time_dim TIME_DIM] + [--time_start TIME_START] + [--time_stop TIME_STOP] [--realization_name REALIZATION_NAME] [--runner RUNNER] ``` @@ -292,6 +256,9 @@ _Command options_: * `--input_path`: (required) Input Zarr path * `--output_path`: (required) Output Zarr path +* `--time_dim`: Name for the time dimension to slice data on. Default: `time` +* `--time_start`: ISO 8601 timestamp (inclusive) at which to start evaluation. Default: `2020-01-01` +* `--time_stop`: ISO 8601 timestamp (inclusive) at which to stop evaluation. Default: `2020-12-31` * `--realization_name`: Name of realization/member/number dimension. Default: `realization` * `--runner`: Beam runner. Use `DirectRunner` for local execution. 
@@ -384,6 +351,83 @@ python regrid.py \ --regridding_method=conservative ``` +(compute_averages)= +## Compute averages +Computes average over dimensions of a forecast dataset. + +``` +usage: compute_averages.py [-h] + [--input_path INPUT_PATH] + [--output_path OUTPUT_PATH] + [--output_chunks OUTPUT_CHUNKS] + [--time_dim TIME_DIM] + [--time_start TIME_START] + [--time_stop TIME_STOP] + [--variables VARIABLES] + [--fanout FANOUT] + [--runner RUNNER] +``` + +_Command options_: + +* `--input_path`: (required) Input Zarr path +* `--output_path`: (required) Output Zarr path +* `--time_dim`: Name for the time dimension to slice data on. Default: `time` +* `--time_start`: ISO 8601 timestamp (inclusive) at which to start evaluation. Default: `2020-01-01` +* `--time_stop`: ISO 8601 timestamp (inclusive) at which to stop evaluation. Default: `2020-12-31` +* `--variables`: Comma delimited list of data variables to include in output. If empty, compute on all data_vars of --input_path. +* `--fanout`: Beam CombineFn fanout. Might be required for large dataset. +* `--runner`: Beam runner. Use `DirectRunner` for local execution. + +*Example* + +```bash +python compute_averages.py \ + --input_path=gs://weatherbench2/datasets/era5/1959-2022-6h-64x32_equiangular_with_poles_conservative.zarr \ + --output_path=gs://$BUCKET/datasets/era5/$USER/temperature-vertical-profile.zarr \ + --runner=DataflowRunner \ + -- \ + --project=$PROJECT \ + --averaging_dims=time,longitude \ + --variables=temperature \ + --temp_location=gs://$BUCKET/tmp/ \ + --setup_file=./setup.py \ + --requirements_file=./scripts/dataflow-requirements.txt \ + --job_name=compute-vertical-profile-$USER +``` + +(resample_daily)= +## Resample daily +Resamples a dataset in time, e.g. to compute daily mean, minimum or maximum values. 
+ +``` +usage: resample_daily.py [-h] + [--input_path INPUT_PATH] + [--output_path OUTPUT_PATH] + [--method METHOD] + [--period PERIOD] + [--statistics STATISTICS] + [--add_statistic_suffix] + [--num_threads NUM_THREADS] + [--start_year START_YEAR] + [--end_year END_YEAR] + [--working_chunks WORKING_CHUNKS] + [--beam_runner BEAM_RUNNER] +``` + +_Command options_: + +* `--input_path`: (required) Input Zarr path +* `--output_path`: (required) Output Zarr path +* `--method`: resample or roll +* `--period`: int + d or w +* `--statistics`: Output resampled time statistics, from "mean", "min", or "max". +* `--add_statistic_suffix`: Add suffix of statistic to variable name. Required for >1 statistic. +* `--num_threads`: Number of chunks to load in parallel per worker. +* `--start_year`: Start year (inclusive). +* `--end_year`: End year (inclusive). +* `--working_chunks`: Spatial chunk sizes to use during time downsampling, e.g., "longitude=10,latitude=10". They may not include "time". +* `--beam_runner`: Beam runner. Use `DirectRunner` for local execution. ## Expand climatology diff --git a/docs/source/data-guide.ipynb b/docs/source/data-guide.ipynb index c34d320..59679d7 100644 --- a/docs/source/data-guide.ipynb +++ b/docs/source/data-guide.ipynb @@ -21,30 +21,33 @@ "\n", "One core part of WeatherBench 2 are ready-to-use, cloud-based datasets. This page lists and describes all the available datasets.\n", "\n", - "The datasets are stored in this public Google Cloud bucket: [`gs://weatherbench2/datasets`](https://console.cloud.google.com/storage/browser/weatherbench2). **Please also check the LICENSE files for each dataset in the respective GCS buckets.** Some datasets allow commercial use. Others only permit research use.\n", + "The datasets are stored in this public Google Cloud bucket: [`gs://weatherbench2/datasets`](https://console.cloud.google.com/storage/browser/weatherbench2). 
\n", + "\n", + "**Please also check the LICENSE files for each dataset in the respective GCS buckets.** Some datasets allow commercial use. Others only permit research use.\n", "\n", "### A note on resolutions\n", "\n", "We provide the datasets at different resolutions. All files will have the number of longitude X latitude grid points in their filename, e.g. `64x32`. For the WeatherBench 2 paper, all evaluation was done at `240x121` = 1.5 degree resolution. All datasets were regridded using first-order conservative regridding, i.e., with weights proportional to the area of overlap between grid cells on the original and desired grids.\n", "\n", - "The ERA5 resolution files (`1440x721` = 0.25 degrees) contain the poles, i.e. -90 and 90 degree latitude. Most regridded files also do, denoted with `with_poles`. The `512x256` files do not contain the pole grid points.\n", + "The `1440x721` (= 0.25 degrees) and `240x121` files contain the poles, i.e. -90 and 90 degree latitude, denoted with `with_poles`. `64x32` files do not contain the poles to ensure equal spacing.\n", "\n", "## Ground-truth datasets\n", "\n", "### ERA5 \n", "\n", - "Our ERA5 datasets were downloaded from the [Copernicus Climate Data Store](https://cds.climate.copernicus.eu/) and have a time range from 1959 to 2022 (incl.). The data here have been downsampled to 6h and 13 levels, even though a raw hourly dataset with 37 levels is also available at `gs://weatherbench2/datasets/era5/1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2`\n", + "Our ERA5 datasets were downloaded from the [Copernicus Climate Data Store](https://cds.climate.copernicus.eu/) and have a time range from 1959 to 2023 (incl.). 
The data here have been downsampled to 6h and 13 levels, even though a raw hourly dataset with 37 levels is also available at `gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr`\n", "\n", "Location: [`gs://weatherbench2/datasets/era5/`](https://console.cloud.google.com/storage/browser/weatherbench2/datasets/era5/)\n", "\n", "Files:\n", - "* `1959-2022-6h-1440x721.zarr`\n", - "* `1959-2022-6h-512x256_equiangular_conservative.zarr`\n", - "* `1959-2022-6h-240x121_equiangular_with_poles_conservative.zarr`\n", - "* `1959-2022-6h-128x64_equiangular_with_poles_conservative.zarr`\n", - "* `1959-2022-6h-64x32_equiangular_with_poles_conservative.zarr`\n", + "* `1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr`\n", + "* `1959-2023_01_10-wb13-6h-1440x721_with_derived_variables.zarr`\n", + "* `1959-2023_01_10-6h-240x121_equiangular_with_poles_conservative.zarr`\n", + "* `1959-2023_01_10-6h-64x32_equiangular_conservative.zarr`\n", + "\n", + "*Note: Older version of the ERA5 files exist in the bucket to ensure continuity.*\n", "\n", - "See output below for a list of variables. Wind speed was derived using [this method](https://weatherbench2.readthedocs.io/en/latest/_autosummary/weatherbench2.derived_variables.WindSpeed.html#weatherbench2.derived_variables.WindSpeed)." + "See output below for a list of variables. The file also contains several derived variables which were computed using these [methods](https://weatherbench2.readthedocs.io/en/latest/api.html#derived-variables)." ] }, { @@ -53,14 +56,6 @@ "id": "972c9516-847b-4813-9618-1ce54b6dad40", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/miniconda3/envs/weatherbench2/lib/python3.11/site-packages/google/auth/_default.py:79: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. 
We recommend you rerun `gcloud auth application-default login` and make sure a quota project is added. Or you can use service accounts instead. For more information about service accounts, see https://cloud.google.com/docs/authentication/\n", - " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" - ] - }, { "data": { "text/html": [ @@ -428,7 +423,7 @@ " fill: currentColor;\n", "}\n", "
<xarray.Dataset>\n",
-       "Dimensions:                                           (time: 92044,\n",
+       "Dimensions:                                           (time: 93544,\n",
        "                                                       latitude: 721,\n",
        "                                                       longitude: 1440,\n",
        "                                                       level: 13)\n",
@@ -437,25 +432,25 @@
        "  * level                                             (level) int64 50 ... 1000\n",
        "  * longitude                                         (longitude) float32 0.0...\n",
        "  * time                                              (time) datetime64[ns] 1...\n",
-       "Data variables: (12/38)\n",
+       "Data variables: (12/61)\n",
        "    10m_u_component_of_wind                           (time, latitude, longitude) float32 dask.array<chunksize=(1, 721, 1440), meta=np.ndarray>\n",
        "    10m_v_component_of_wind                           (time, latitude, longitude) float32 dask.array<chunksize=(1, 721, 1440), meta=np.ndarray>\n",
        "    10m_wind_speed                                    (time, latitude, longitude) float32 dask.array<chunksize=(1, 721, 1440), meta=np.ndarray>\n",
+       "    2m_dewpoint_temperature                           (time, latitude, longitude) float32 dask.array<chunksize=(1, 721, 1440), meta=np.ndarray>\n",
        "    2m_temperature                                    (time, latitude, longitude) float32 dask.array<chunksize=(1, 721, 1440), meta=np.ndarray>\n",
-       "    angle_of_sub_gridscale_orography                  (latitude, longitude) float32 dask.array<chunksize=(721, 1440), meta=np.ndarray>\n",
-       "    anisotropy_of_sub_gridscale_orography             (latitude, longitude) float32 dask.array<chunksize=(721, 1440), meta=np.ndarray>\n",
+       "    ageostrophic_wind_speed                           (time, level, latitude, longitude) float32 dask.array<chunksize=(1, 13, 721, 1440), meta=np.ndarray>\n",
        "    ...                                                ...\n",
-       "    type_of_high_vegetation                           (latitude, longitude) float32 dask.array<chunksize=(721, 1440), meta=np.ndarray>\n",
-       "    type_of_low_vegetation                            (latitude, longitude) float32 dask.array<chunksize=(721, 1440), meta=np.ndarray>\n",
-       "    u_component_of_wind                               (time, level, latitude, longitude) float32 dask.array<chunksize=(1, 13, 721, 1440), meta=np.ndarray>\n",
-       "    v_component_of_wind                               (time, level, latitude, longitude) float32 dask.array<chunksize=(1, 13, 721, 1440), meta=np.ndarray>\n",
-       "    vertical_velocity                                 (time, level, latitude, longitude) float32 dask.array<chunksize=(1, 13, 721, 1440), meta=np.ndarray>\n",
-       "    wind_speed                                        (time, level, latitude, longitude) float32 dask.array<chunksize=(1, 13, 721, 1440), meta=np.ndarray>