Loading data from the catalog#

Long story short:#

import intake
# outtake is optional: it is only needed for downloads from tape, so a missing
# module is reported on stderr instead of aborting the session.
try:
    import outtake
except ImportError:
    # Only a failed import is handled here; anything else (including
    # KeyboardInterrupt) propagates instead of being mistaken for a
    # missing module.
    import sys

    print(
        """Could not load outtake - tape downloads might not work. Try adding

module use /work/k20200/k202134/hsm-tools/outtake/module
module load hsm-tools/unstable

to your ~/.kernel_env file""",
        file=sys.stderr,
    )


# Catalog covering both nextGEMS and DYAMOND Winter.
catalog_file = "/work/ka1081/Catalogs/dyamond-nextgems.json"
cat = intake.open_esm_datastore(catalog_file)

# Narrow the catalog down to the 30-minute "tas" output of run ngc2009.
hits = cat.search(
    simulation_id="ngc2009",
    variable_id="tas",
    frequency="30minute",
)

# Chunk along time (one step per chunk) so only reasonably sized pieces
# of data are loaded at a time.
dataset_dict = hits.to_dataset_dict(
    cdf_kwargs={"chunks": {"time": 1}},
)

# The search yields a dictionary of datasets; pick the first entry.
keys = list(dataset_dict)
dataset = dataset_dict[keys[0]]

# Maximum of tas at the second time step (index 1).
dataset["tas"].isel(time=1).max().values

# use get_from_cat from below to search a catalog

Loading the catalog#

The intake-esm package provides a tool to access large amounts of data without having to worry about where they come from. We will give you a short overview of how to use the catalog to your advantage. The root of the intake catalog is a ‘.json’ file.

import pandas as pd

pd.set_option("max_colwidth", None)  # makes the tables render better

import intake

# outtake is optional: it is only needed for downloads from tape, so a missing
# module is reported on stderr instead of aborting the session.
try:
    import outtake
except ImportError:
    # Only a failed import is handled here; anything else (including
    # KeyboardInterrupt) propagates instead of being mistaken for a
    # missing module.
    import sys

    print(
        """Could not load outtake - tape downloads might not work. Try adding
    
module use /work/k20200/k202134/hsm-tools/outtake/module
module load hsm-tools/unstable

to your ~/.kernel_env file""",
        file=sys.stderr,
    )


def get_from_cat(catalog, columns):
    """Return the distinct, sorted combinations of selected catalog columns.

    A helper function for inspecting an intake catalog.

    Parameters
    ----------
    catalog
        Object exposing the catalog table as a pandas DataFrame through its
        ``df`` attribute. The table itself is never modified.
    columns : str or iterable of str
        Name(s) of the columns of interest. A single column may be given as
        a plain string; any other iterable of names (list, tuple, ...) is
        accepted as well.

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination of values in ``columns``, sorted by
        those columns and re-indexed from 0. Cells that held Python lists
        are returned as tuples.

    Raises
    ------
    KeyError
        If a requested column does not exist in ``catalog.df``.

    Notes
    -----
    As a side effect the pandas display option ``max_colwidth`` is set to
    ``None`` for the whole session.
    """
    import pandas as pd

    pd.set_option("max_colwidth", None)  # makes the tables render better

    # pandas treats a tuple as ONE column label and a bare string would be
    # iterated character by character, so normalise to a list of names.
    columns = [columns] if isinstance(columns, str) else list(columns)

    # Select first, then copy: the catalog's own table stays untouched, and
    # the columns nobody asked for are not duplicated in memory.
    df = catalog.df[columns].copy()

    for col in columns:
        # Lists are not hashable, but drop_duplicates() needs hashable
        # cells, so lists are converted to tuples.
        df[col] = df[col].apply(lambda x: tuple(x) if isinstance(x, list) else x)

    return df.drop_duplicates().sort_values(columns).reset_index(drop=True)

# Path of the '.json' file at the root of the intake catalog.
catalog_file = "/work/ka1081/Catalogs/dyamond-nextgems.json"

# Open the catalog; the bare `cat` on the last line makes the notebook
# display its summary.
cat = intake.open_esm_datastore(catalog_file)
cat

/work/k20200/k202134/Catalogs/dng-merged catalog with 167 dataset(s) from 120310 asset(s):

	unique
variable_id	643
project	2
institution_id	13
source_id	21
experiment_id	5
simulation_id	16
realm	6
frequency	16
time_reduction	5
grid_label	11
level_type	6
time_min	3153
time_max	7000
grid_id	16
format	2
uri	120044

The meanings of the categories are:

Info	Description
variable_id	Shortname of variables.
project	Larger project the simulation belongs to.
source_id	Model name.
experiment_id	Class of experiment
simulation_id	Id of the run.
realm	oceanic or atmospheric data
frequency	Frequency in time of datapoints.
time_reduction	Average/Instantaneous/…
grid_label	Identifier for horizontal gridtype.
level_type	Identifier for vertical gridtype.
time_min	Starting time for a specific file.
time_max	End of time covered by a specific file.
grid_id	Identifier of horizontal grid.
uri	Uniform resource identifier, location of data files.

Searching the catalog#

You can access the underlying pandas dataframe with “cat.df”. Here we show the first 2 entries with head():

cat.df.head(n=2)

	variable_id	project	institution_id	source_id	experiment_id	simulation_id	realm	frequency	time_reduction	grid_label	level_type	time_min	time_max	grid_id	format	uri
0	(c, l, i, v, i)	DYAMOND_WINTER	CAMS	GRIST-5km	DW-ATM	r1i1p1f1	atmos	15min	unkonwn	gn	2d	2020-01-20T00:00:00.000	2020-01-20T23:45:00.000	not_implemented	netcdf	/work/ka1081/DYAMOND_WINTER/CAMS/GRIST-5km/DW-ATM/atmos/15min/clivi/r1i1p1f1/2d/gn/clivi_15min_GRIST-5km_DW-ATM_r1i1p1f1_2d_gn_20200120000000-20200120234500.nc
1	(c, l, t)	DYAMOND_WINTER	CAMS	GRIST-5km	DW-ATM	r1i1p1f1	atmos	15min	unkonwn	gn	2d	2020-01-20T00:00:00.000	2020-01-20T23:45:00.000	not_implemented	netcdf	/work/ka1081/DYAMOND_WINTER/CAMS/GRIST-5km/DW-ATM/atmos/15min/clt/r1i1p1f1/2d/gn/clt_15min_GRIST-5km_DW-ATM_r1i1p1f1_2d_gn_20200120000000-20200120234500.nc

To reduce the output, we have defined a helper function in the header of this document. We can use it to get an overview of projects, experiments, and models in the catalog.

get_from_cat(cat, ["project", "experiment_id", "source_id", "simulation_id"])

	project	experiment_id	source_id	simulation_id
0	DYAMOND_WINTER	DW-ATM	ARPEGE-NH-2km	r1i1p1f1
1	DYAMOND_WINTER	DW-ATM	GEM	r1i1p1f1
2	DYAMOND_WINTER	DW-ATM	GEOS-1km	r1i1p1f1
3	DYAMOND_WINTER	DW-ATM	GEOS-3km	r1i1p1f1
4	DYAMOND_WINTER	DW-ATM	GRIST-5km	r1i1p1f1
5	DYAMOND_WINTER	DW-ATM	ICON-NWP-2km	r1i1p1f1
6	DYAMOND_WINTER	DW-ATM	ICON-SAP-5km	dpp0014
7	DYAMOND_WINTER	DW-ATM	MPAS-3km	r1i1p1f1
8	DYAMOND_WINTER	DW-ATM	SCREAM-3km	r1i1p1f1
9	DYAMOND_WINTER	DW-ATM	SHiELD-3km	r1i1p1f1
10	DYAMOND_WINTER	DW-ATM	UM-5km	r1i1p1f1
11	DYAMOND_WINTER	DW-ATM	gSAM-4km	r1i1p1f1
12	DYAMOND_WINTER	DW-CPL	GEOS-6km	r1i1p1f1
13	DYAMOND_WINTER	DW-CPL	ICON-SAP-5km	dpp0029
14	DYAMOND_WINTER	DW-CPL	ICON-SAP-5km	r1i1p1f1
15	DYAMOND_WINTER	DW-CPL	IFS-4km	r1i1p1f1
16	DYAMOND_WINTER	DW-CPL	IFS-9km	r1i1p1f1
17	nextGEMS	Cycle1	IFS-FESOM2-4km	hlq0
18	nextGEMS	Cycle1	IFS-NEMO-4km	hmrt
19	nextGEMS	Cycle1	IFS-NEMO-9km	hmt0
20	nextGEMS	Cycle1	IFS-NEMO-DEEPon-4km	hmwz
21	nextGEMS	Cycle2-alpha	ICON-ESM	dpp0066
22	nextGEMS	Cycle2-alpha	ICON-ESM	dpp0067
23	nextGEMS	nextgems_cycle2	ICON-ESM	ngc2009
24	nextGEMS	nextgems_cycle2	ICON-ESM	ngc2012
25	nextGEMS	nextgems_cycle2	ICON-ESM	ngc2013
26	nextGEMS	nextgems_cycle2	IFS-FESOM	HQYS
27	nextGEMS	nextgems_cycle2	IFS-FESOM	HR0N
28	nextGEMS	nextgems_cycle2	IFS-FESOM	HR2N
29	nextGEMS	nextgems_cycle2	IFS-FESOM	HR2N_nodeep

Let’s look into the variables of ICON in NGC2009. Detailed information about how to search the catalog can be found here.

get_from_cat(cat.search(simulation_id="ngc2009"), ["realm", "frequency", "variable_id"])

	realm	frequency	variable_id
0	atm	1day	(clt, evspsbl, tas, ts, rldscs, rlutcs, rsdscs, rsuscs, rsutcs)
1	atm	1day	(psl, clt, evspsbl, tas, ts, rldscs, rlutcs, rsdscs, rsuscs, rsutcs)
2	atm	1month	(sfcwind, clivi, cllvi, cptgzvi, hfls, hfss, prlr, pr, prw, qgvi, qrvi, qsvi, rlds, rlus, rlut, rsds, rsdt, rsus, rsut, tauu, tauv, rpds_dir, rpds_dif, rvds_dif, rnds_dif)
3	atm	1month	(sfcwind, clivi, cllvi, cptgzvi, hfls, hfss, prlr, pr, prw, qgvi, qrvi, qsvi, rlds, rlus, rlut, rsds, rsdt, rsus, rsut, tauu, tauv, rpds_dir, rpds_dif, rvds_dif, rnds_dif, clt, evspsbl, tas, ts, rldscs, rlutcs, rsdscs, rsuscs, rsutcs)
4	atm	1month	(sfcwind, clivi, cllvi, cptgzvi, hfls, hfss, prlr, pr, prw, qgvi, qrvi, qsvi, rlds, rlus, rlut, rsds, rsdt, rsus, rsut, tauu, tauv, rpds_dir, rpds_dif, rvds_dif, rnds_dif, psl, clt, evspsbl, tas, ts, rldscs, rlutcs, rsdscs, rsuscs, rsutcs)
5	atm	1month	(ua, va, wa, ta, hus, rho, clw, cli, pfull, zghalf, zg, dzghalf)
6	atm	2hour	(phalf,)
7	atm	2minute	(fc, frland, hsurf, p, rnds_dif, rnds_dir, rsds, rvds_dif, rvds_dir, soiltype, t, u, v, w)
8	atm	30minute	(hydro_canopy_cond_limited_box, hydro_w_snow_box, hydro_snow_soil_dens_box)
9	atm	30minute	(hydro_discharge_ocean_box, hydro_drainage_box, hydro_runoff_box, hydro_transpiration_box, sse_grnd_hflx_old_box)
10	atm	30minute	(psl, ps, sit, sic, tas, ts, uas, vas, cfh_lnd)
11	atm	30minute	(sfcwind, clivi, cllvi, cptgzvi, hfls, hfss, prlr, pr, prw, qgvi, qrvi, qsvi, rlds, rlus, rlut, rsds, rsdt, rsus, rsut, tauu, tauv, rpds_dir, rpds_dif, rvds_dif, rnds_dif)
12	atm	6hour	(clw, cli, pfull)
13	atm	6hour	(hydro_w_soil_sl_box, hydro_w_ice_sl_box, sse_t_soil_sl_box)
14	atm	6hour	(ta, hus, rho)
15	atm	6hour	(ta, ua, va, clw, hus, zfull, cli, pv)
16	atm	6hour	(tas_gmean, rsdt_gmean, rsut_gmean, rlut_gmean, radtop_gmean, prec_gmean, evap_gmean, fwfoce_gmean)
17	atm	6hour	(ua, va, wa)
18	atm	fx	(zghalf, zg, dzghalf)
19	lnd	1month	(hydro_discharge_ocean_box, hydro_drainage_box, hydro_runoff_box, hydro_transpiration_box, sse_grnd_hflx_old_box, hydro_canopy_cond_limited_box, hydro_w_snow_box, hydro_snow_soil_dens_box, hydro_w_soil_sl_box, hydro_w_ice_sl_box, sse_t_soil_sl_box)
20	oce	1day	(atlantic_hfbasin, atlantic_hfl, atlantic_moc, atlantic_sltbasin, atlantic_wfl, global_hfbasin, global_hfl, global_moc, global_sltbasin, global_wfl, pacific_hfbasin, pacific_hfl, pacific_moc, pacific_sltbasin, pacific_wfl)
21	oce	1day	(atmos_fluxes_FrshFlux_Evaporation, atmos_fluxes_FrshFlux_Precipitation, atmos_fluxes_FrshFlux_Runoff, atmos_fluxes_FrshFlux_SnowFall, atmos_fluxes_HeatFlux_Latent, atmos_fluxes_HeatFlux_LongWave, atmos_fluxes_HeatFlux_Sensible, atmos_fluxes_HeatFlux_ShortWave, atmos_fluxes_HeatFlux_Total, atmos_fluxes_stress_x, atmos_fluxes_stress_xw, atmos_fluxes_stress_y, atmos_fluxes_stress_yw, conc, heat_content_seaice, heat_content_snow, heat_content_total, hi, hs, ice_u, ice_v, mlotst, Qbot, Qtop, sea_level_pressure, stretch_c, zos, verticallyTotal_mass_flux_e, Wind_Speed_10m)
22	oce	1day	(so, tke, to, u, v, w, A_tracer_v_to, A_veloc_v, heat_content_liquid_water)
23	oce	1hour	(atmos_fluxes_FrshFlux_Evaporation, atmos_fluxes_FrshFlux_Precipitation, atmos_fluxes_FrshFlux_Runoff, atmos_fluxes_FrshFlux_SnowFall, atmos_fluxes_HeatFlux_Latent, atmos_fluxes_HeatFlux_LongWave, atmos_fluxes_HeatFlux_Sensible, atmos_fluxes_HeatFlux_ShortWave, atmos_fluxes_HeatFlux_Total, atmos_fluxes_stress_x, atmos_fluxes_stress_xw, atmos_fluxes_stress_y, atmos_fluxes_stress_yw, Qbot, Qtop)
24	oce	1hour	(so, to, u, v, conc, hi, hs, ice_u, ice_v, mlotst, sea_level_pressure, stretch_c, Wind_Speed_10m, zos)
25	oce	1month	(A_tracer_v_to, tke)
26	oce	1month	(atmos_fluxes_FrshFlux_Evaporation, atmos_fluxes_FrshFlux_Precipitation, atmos_fluxes_FrshFlux_Runoff, atmos_fluxes_FrshFlux_SnowFall, atmos_fluxes_HeatFlux_Latent, atmos_fluxes_HeatFlux_LongWave, atmos_fluxes_HeatFlux_Sensible, atmos_fluxes_HeatFlux_ShortWave, atmos_fluxes_HeatFlux_Total, atmos_fluxes_stress_x, atmos_fluxes_stress_xw, atmos_fluxes_stress_y, atmos_fluxes_stress_yw, conc, heat_content_seaice, heat_content_snow, heat_content_total, hi, hs, ice_u, ice_v, mlotst, Qbot, Qtop, sea_level_pressure, stretch_c, zos, Wind_Speed_10m)
27	oce	1month	(so, tke, to, u, v, w, A_tracer_v_to, heat_content_liquid_water)
28	oce	1month	(so, to, u, v, w)
29	oce	3hour	(A_tracer_v_to, A_veloc_v, tke)
30	oce	3hour	(so, to, u, v, w)
31	oce	6hour	(total_salt, total_saltinseaice, total_saltinliquidwater, amoc26n, kin_energy_global, pot_energy_global, total_energy_global, ssh_global, sst_global, sss_global, potential_enstrophy_global, HeatFlux_Total_global, FrshFlux_Precipitation_global, FrshFlux_SnowFall_global, FrshFlux_Evaporation_global, FrshFlux_Runoff_global, FrshFlux_VolumeIce_global, FrshFlux_TotalOcean_global, FrshFlux_TotalIce_global, FrshFlux_VolumeTotal_global, totalsnowfall_global, ice_volume_nh, ice_volume_sh, ice_extent_nh, ice_extent_sh, global_heat_content, global_heat_content_solid)

Let’s look into surface air temperature (tas)

get_from_cat(
    cat.search(simulation_id="ngc2009", variable_id="tas"),
    ["realm", "frequency", "level_type", "variable_id"],
)

	realm	frequency	level_type	variable_id
0	atm	1day	ml	(clt, evspsbl, tas, ts, rldscs, rlutcs, rsdscs, rsuscs, rsutcs)
1	atm	1day	ml	(psl, clt, evspsbl, tas, ts, rldscs, rlutcs, rsdscs, rsuscs, rsutcs)
2	atm	1month	ml	(sfcwind, clivi, cllvi, cptgzvi, hfls, hfss, prlr, pr, prw, qgvi, qrvi, qsvi, rlds, rlus, rlut, rsds, rsdt, rsus, rsut, tauu, tauv, rpds_dir, rpds_dif, rvds_dif, rnds_dif, clt, evspsbl, tas, ts, rldscs, rlutcs, rsdscs, rsuscs, rsutcs)
3	atm	1month	ml	(sfcwind, clivi, cllvi, cptgzvi, hfls, hfss, prlr, pr, prw, qgvi, qrvi, qsvi, rlds, rlus, rlut, rsds, rsdt, rsus, rsut, tauu, tauv, rpds_dir, rpds_dif, rvds_dif, rnds_dif, psl, clt, evspsbl, tas, ts, rldscs, rlutcs, rsdscs, rsuscs, rsutcs)
4	atm	30minute	ml	(psl, ps, sit, sic, tas, ts, uas, vas, cfh_lnd)

# Restrict the search to the 30-minute output. The 1day files would have
# crashed the Jupyter kernel because they are inconsistent across the run.
hits = cat.search(simulation_id="ngc2009", variable_id="tas", frequency="30minute")
hits

/work/k20200/k202134/Catalogs/dng-merged catalog with 1 dataset(s) from 817 asset(s):

	unique
variable_id	9
project	1
institution_id	1
source_id	1
experiment_id	1
simulation_id	1
realm	1
frequency	1
time_reduction	1
grid_label	1
level_type	1
time_min	817
time_max	817
grid_id	1
format	1
uri	817

Note: The variable_id field still shows 9, because the file(s) containing tas hold 9 variables in total.

Loading the Data#

Once you have searched the catalog and want to access the actual data, it is time to load it.

The option cdf_kwargs={"chunks": {"time": 1}} is used so that only reasonably sized chunks of data are loaded at a time. Your kernel WILL crash if you try to load the whole dataset at once!

dataset_dict = hits.to_dataset_dict(cdf_kwargs={"chunks": {"time": 1}})

--> The keys in the returned dictionary of datasets are constructed as follows:
	'project.institution_id.source_id.experiment_id.simulation_id.realm.frequency.time_reduction.grid_label.level_type'

100.00% [1/1 00:00<00:00]

We have only one dataset; to access it, we need its key:

keys = list(dataset_dict.keys())
keys

['nextGEMS.MPI-M.ICON-ESM.nextgems_cycle2.ngc2009.atm.30minute.inst.gn.ml']

Now we can finally access the data:

dataset = dataset_dict[keys[0]]
dataset

<xarray.Dataset>
Dimensions:  (time: 36722, height: 1, ncells: 20971520)
Coordinates:
  * height   (height) float64 2.0
  * time     (time) datetime64[ns] 2020-01-20 2020-01-20T00:30:00 ... 2022-03-01
Dimensions without coordinates: ncells
Data variables:
    tas      (time, height, ncells) float32 dask.array<chunksize=(1, 1, 20971520), meta=np.ndarray>
Attributes: (12/13)
    Conventions:             CF-1.6
    institution:             Max Planck Institute for Meteorology/Deutscher W...
    number_of_grid_used:     15
    CDI:                     Climate Data Interface version 1.8.3rc (http://m...
    uuidOfHGrid:             0f1e7d66-637e-11e8-913b-51232bb4d8f9
    history:                 ./icon at 20220512 152214\n./icon at 20220512 19...
    ...                      ...
    title:                   ICON simulation
    grid_file_uri:           http://icon-downloads.mpimet.mpg.de/grids/public...
    comment:                 Sapphire Dyamond (k203123) on l10739 (Linux 4.18...
    source:                  git@gitlab.dkrz.de:icon/icon-aes.git@87a1eaded69...
    intake_esm_varname:      ['tas']
    intake_esm_dataset_key:  nextGEMS.MPI-M.ICON-ESM.nextgems_cycle2.ngc2009....

# The first time step contains only zeros, so we select the second one
# with isel(time=1).
dataset.tas.isel(time=1).min().values

array(225.27545, dtype=float32)

dataset.tas.isel(time=1).max().values

array(312.81677, dtype=float32)

dataset.tas.max(dim="ncells")  # lazy evaluation - no real work is done yet.

<xarray.DataArray 'tas' (time: 36722, height: 1)>
dask.array<_nanmax_skip-aggregate, shape=(36722, 1), dtype=float32, chunksize=(1, 1), chunktype=numpy.ndarray>
Coordinates:
  * height   (height) float64 2.0
  * time     (time) datetime64[ns] 2020-01-20 2020-01-20T00:30:00 ... 2022-03-01

# evaluate if you have time to spare
# dataset.tas.max(dim="ncells").values