diff --git a/EmulatorShowcaseSarah/0preprocessing/1-ERA5-land2012.ipynb b/EmulatorShowcaseSarah/0preprocessing/1-ERA5-land2012.ipynb deleted file mode 100644 index 7c451bd..0000000 --- a/EmulatorShowcaseSarah/0preprocessing/1-ERA5-land2012.ipynb +++ /dev/null @@ -1,2027 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "9a3997ce-701b-49b2-ad82-b6532a8e6697", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import xarray as xr\n", - "import glob\n", - "\n", - "from dask.distributed import Client, LocalCluster\n", - "from dask_jobqueue import SLURMCluster" - ] - }, - { - "cell_type": "markdown", - "id": "c43a1dc9-90ac-469a-881a-d166d1218601", - "metadata": {}, - "source": [ - "# ERA5 Land data: from NetCDF3 to Zarr\n", - "\n", - "In this notebook we load ERA5 Land data stored in a collection of NetCDF3 files and we save it as a (chunked) Zarr store. It is global." - ] - }, - { - "cell_type": "markdown", - "id": "203afa62-9c85-4dcc-b9c8-6f5d120635eb", - "metadata": {}, - "source": [ - "## Input variables" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "15c683a7-a045-40f6-80a3-a1b227ea8ed9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "year = 2012" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "bd3c168f-e425-4485-b4c2-88af1b3fec72", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ROOT_DIR = '/gpfs/work2/0/ttse0619'\n", - "ERA5_DIR = (\n", - " f'{ROOT_DIR}/qianqian/global_data_Qianqian/'\n", - " f'1input_data/{year}global/era5land/'\n", - ")\n", - "OUT_DIR = (\n", - " f'{ROOT_DIR}/qianqian/global_data_Qianqian/'\n", - " f'1input_data/{year}global/era5land'\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "c746f1fb-b102-4b5f-804f-a26fe265fd79", - "metadata": {}, - "source": [ - "## Setup Dask cluster\n", - "\n", - "NOTE: when working with NetCDF files (and the netcdf4 library) it is much better to work with many processes and few threads per process: netcdf4 can only read from one thread per process." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "dde3dbe9-9e06-44f0-a6a2-111056281e6a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# cluster = LocalCluster(n_workers=16, threads_per_worker=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "c1b603ec-cd35-4f5d-ab2b-9b7981afdfa1", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/qiahan/.conda/envs/mamba/envs/emulator/lib/python3.9/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\n", - "Perhaps you already have a cluster running?\n", - "Hosting the HTTP server on port 36437 instead\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "cluster = SLURMCluster(\n", - " name='dask-worker',\n", - " cores=16,\n", - " processes=16,\n", - " queue='fat',\n", - " memory='120GiB',\n", - " local_directory='$TMPDIR',\n", - " walltime='3:00:00'\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "a5b5ad3a-4541-433c-b5cc-1653ec793106", - "metadata": {}, - "source": [ - "We use in total 4 * 16 = 64 single-threaded workers, and ~480 GiB total memory: " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6f9778da-d8f5-402c-8490-d9de2eaedecd", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "cluster.scale(jobs=4)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "0fd4eb4b-4a9c-4bc6-9726-5046c3c84ed4", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
Client-dcdde942-be9c-11ee-a0b4-0a94efbd6637
\n", - "| Connection method: Cluster object | \n", - "Cluster type: dask_jobqueue.SLURMCluster | \n", - " \n", - "
| \n", - " Dashboard: /proxy/36437/status\n", - " | \n", - "\n", - " |
dask-worker
\n", - "| \n", - " Dashboard: /proxy/36437/status\n", - " | \n", - "\n", - " Workers: 0\n", - " | \n", - "
| \n", - " Total threads: 0\n", - " | \n", - "\n", - " Total memory: 0 B\n", - " | \n", - "
Scheduler-b02ece05-fd43-43f1-af69-6c0ac646d75e
\n", - "| \n", - " Comm: tcp://145.136.63.1:43843\n", - " | \n", - "\n", - " Workers: 0\n", - " | \n", - "
| \n", - " Dashboard: /proxy/36437/status\n", - " | \n", - "\n", - " Total threads: 0\n", - " | \n", - "
| \n", - " Started: Just now\n", - " | \n", - "\n", - " Total memory: 0 B\n", - " | \n", - "
<xarray.Dataset>\n", - "Dimensions: (longitude: 3600, latitude: 1801, time: 8784)\n", - "Coordinates:\n", - " * longitude (longitude) float32 0.0 0.1 0.2 0.3 ... 359.6 359.7 359.8 359.9\n", - " * latitude (latitude) float32 90.0 89.9 89.8 89.7 ... -89.8 -89.9 -90.0\n", - " * time (time) datetime64[ns] 2012-01-01 ... 2012-12-31T23:00:00\n", - "Data variables:\n", - " d2m (time, latitude, longitude) float32 dask.array<chunksize=(750, 250, 250), meta=np.ndarray>\n", - " sp (time, latitude, longitude) float32 dask.array<chunksize=(750, 250, 250), meta=np.ndarray>\n", - " ssrd (time, latitude, longitude) float32 dask.array<chunksize=(750, 250, 250), meta=np.ndarray>\n", - " strd (time, latitude, longitude) float32 dask.array<chunksize=(750, 250, 250), meta=np.ndarray>\n", - " t2m (time, latitude, longitude) float32 dask.array<chunksize=(750, 250, 250), meta=np.ndarray>\n", - " tp (time, latitude, longitude) float32 dask.array<chunksize=(750, 250, 250), meta=np.ndarray>\n", - " u10 (time, latitude, longitude) float32 dask.array<chunksize=(750, 250, 250), meta=np.ndarray>\n", - " v10 (time, latitude, longitude) float32 dask.array<chunksize=(750, 250, 250), meta=np.ndarray>\n", - "Attributes:\n", - " Conventions: CF-1.6\n", - " history: 2023-06-15 00:16:00 GMT by grib_to_netcdf-2.25.1: /opt/ecmw...
Client-6d6d96d6-e787-11ee-8805-b8cef6791500
\n", - "| Connection method: Cluster object | \n", - "Cluster type: dask_jobqueue.SLURMCluster | \n", - " \n", - "
| \n", - " Dashboard: /proxy/46315/status\n", - " | \n", - "\n", - " |
dask-worker
\n", - "| \n", - " Dashboard: /proxy/46315/status\n", - " | \n", - "\n", - " Workers: 0\n", - " | \n", - "
| \n", - " Total threads: 0\n", - " | \n", - "\n", - " Total memory: 0 B\n", - " | \n", - "
Scheduler-45fd0fdb-04db-4594-97fe-632c1568f41f
\n", - "| \n", - " Comm: tcp://145.136.63.41:40417\n", - " | \n", - "\n", - " Workers: 0\n", - " | \n", - "
| \n", - " Dashboard: /proxy/46315/status\n", - " | \n", - "\n", - " Total threads: 0\n", - " | \n", - "
| \n", - " Started: Just now\n", - " | \n", - "\n", - " Total memory: 0 B\n", - " | \n", - "
<xarray.DataArray 'LAI' (time: 38, lat: 15680, lon: 40320)>\n", - "dask.array<concatenate, shape=(38, 15680, 40320), dtype=float32, chunksize=(1, 15680, 40320), chunktype=numpy.ndarray>\n", - "Coordinates:\n", - " * lon (lon) float64 -180.0 -180.0 -180.0 -180.0 ... 180.0 180.0 180.0\n", - " * lat (lat) float64 80.0 79.99 79.98 79.97 ... -59.97 -59.98 -59.99\n", - " * time (time) datetime64[ns] 2017-12-31 2018-01-10 ... 2019-01-10\n", - "Attributes:\n", - " grid_mapping: crs\n", - " valid_range: [ 0 210]\n", - " long_name: Leaf Area Index 1km\n", - " standard_name: leaf_area_index\n", - " units: \n", - " _FillValue: nan
<xarray.Dataset>\n", - "Dimensions: (time: 38, latitude: 1801, longitude: 3600)\n", - "Coordinates:\n", - " * time (time) datetime64[ns] 2017-12-31 2018-01-10 ... 2019-01-10\n", - " * latitude (latitude) float64 90.0 89.9 89.8 89.7 ... -89.8 -89.9 -90.0\n", - " * longitude (longitude) float64 0.0 0.1 0.2 0.3 ... 359.6 359.7 359.8 359.9\n", - "Data variables:\n", - " LAI (time, latitude, longitude) float64 dask.array<chunksize=(38, 250, 250), meta=np.ndarray>
<xarray.Dataset>\n", - "Dimensions: (latitude: 469, longitude: 690)\n", - "Coordinates:\n", - " band int64 ...\n", - " * latitude (latitude) float32 81.8 81.7 81.6 81.5 ... 35.3 35.2 35.1 35.0\n", - " * longitude (longitude) float32 0.0 0.1 0.2 0.3 0.4 ... 68.6 68.7 68.8 68.9\n", - "Data variables:\n", - " hc (latitude, longitude) float32 dask.array<chunksize=(250, 250), meta=np.ndarray>\n", - " spatial_ref int64 ...
<xarray.Dataset>\n", - "Dimensions: (band: 1, latitude: 469, longitude: 690)\n", - "Coordinates:\n", - " * band (band) int64 1\n", - " * latitude (latitude) float32 81.8 81.7 81.6 81.5 ... 35.3 35.2 35.1 35.0\n", - " * longitude (longitude) float32 0.0 0.1 0.2 0.3 0.4 ... 68.6 68.7 68.8 68.9\n", - " spatial_ref int64 ...\n", - "Data variables:\n", - " vcmax (latitude, longitude, band) float32 dask.array<chunksize=(250, 250, 1), meta=np.ndarray>
Client-206d978f-086d-11ef-b10a-b8cef6790e70
\n", - "| Connection method: Cluster object | \n", - "Cluster type: dask_jobqueue.SLURMCluster | \n", - " \n", - "
| \n", - " Dashboard: /proxy/45693/status\n", - " | \n", - "\n", - " |
dask-worker
\n", - "| \n", - " Dashboard: /proxy/45693/status\n", - " | \n", - "\n", - " Workers: 0\n", - " | \n", - "
| \n", - " Total threads: 0\n", - " | \n", - "\n", - " Total memory: 0 B\n", - " | \n", - "
Scheduler-43930a6f-73c2-4e0f-be18-0cc37b58dbf5
\n", - "| \n", - " Comm: tcp://145.136.63.38:38245\n", - " | \n", - "\n", - " Workers: 0\n", - " | \n", - "
| \n", - " Dashboard: /proxy/45693/status\n", - " | \n", - "\n", - " Total threads: 0\n", - " | \n", - "
| \n", - " Started: Just now\n", - " | \n", - "\n", - " Total memory: 0 B\n", - " | \n", - "
<xarray.Dataset>\n", - "Dimensions: (time: 264, latitude: 469, longitude: 688)\n", - "Coordinates:\n", - " * latitude (latitude) float64 81.8 81.7 81.6 81.5 ... 35.3 35.2 35.1 35.0\n", - " * longitude (longitude) float64 0.1 0.2 0.3 0.4 0.5 ... 68.5 68.6 68.7 68.8\n", - " * time (time) datetime64[ns] 2014-01-31 ... 2014-02-10T23:00:00\n", - "Data variables:\n", - " Actot (time, latitude, longitude) float64 dask.array<chunksize=(125, 200, 200), meta=np.ndarray>\n", - " Gtot (time, latitude, longitude) float64 dask.array<chunksize=(125, 200, 200), meta=np.ndarray>\n", - " Htot (time, latitude, longitude) float64 dask.array<chunksize=(125, 200, 200), meta=np.ndarray>\n", - " LEtot (time, latitude, longitude) float64 dask.array<chunksize=(125, 200, 200), meta=np.ndarray>\n", - " Rntot (time, latitude, longitude) float64 dask.array<chunksize=(125, 200, 200), meta=np.ndarray>\n", - " SIF685 (time, latitude, longitude) float64 dask.array<chunksize=(125, 200, 200), meta=np.ndarray>\n", - " SIF740 (time, latitude, longitude) float64 dask.array<chunksize=(125, 200, 200), meta=np.ndarray>\n", - " spatial_ref int64 ...