Phase 1
This notebook provides an overview of the EERIE Phase 1 production simulation data available on DKRZ's high-performance computer Levante, and generates and plots statistics about it. We browse through the EERIE intake catalog, search for Phase 1 models, experiments and versions, and collect information from the datasets:
Other Phase 1 sources:
The eerie.cloud provides a browsable interface using an EERIE STAC collection of the Phase 1 simulations. The STAC collection is a mirror of the intake catalog.
The recent progress of the simulations is displayed here.
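As an illustration, such a STAC collection could also be browsed programmatically with pystac-client; note that the endpoint URL below is a hypothetical placeholder, not taken from this notebook.

# Minimal sketch, assuming eerie.cloud exposes a STAC API endpoint (URL is hypothetical).
from pystac_client import Client

stac = Client.open("https://eerie.cloud.dkrz.de/stac")  # hypothetical endpoint
for collection in stac.get_collections():
    print(collection.id)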
Phase 1 simulations are a set of combinations of ESMs and experiments. For each entry in this set, there is exactly one version that is considered the Phase 1 simulation:
IFS-FESOM2-SR: (eerie-spinup-1950, eerie-control-1950, hist-1950)
ICON-ESM-ER: (eerie-spinup-1950, eerie-control-1950, hist-1950)
IFS-AMIP-TCO399: (hist, hist-c-0-a-lr20, hist-c-lr20-a-0)
IFS-AMIP-TCO1279: (hist, hist-c-0-a-lr20)
IFS-NEMO: (eerie-spinup-1950)
HadGEM3-GC5-N640-ORCA12: (eerie-picontrol)
HadGEM3-GC5-N216-ORCA025: (eerie-picontrol)
import xarray as xr
import intake
from copy import deepcopy as copy
import pandas as pd
import hvplot.pandas
import hvplot
import datetime
now = datetime.datetime.now()
# example must be defined
# version must be a list
phase1_simulations = {
    "ifs-fesom2-sr": [
        dict(
            experiment="eerie-spinup-1950",
            version=["v20240304"],
            example="ocean.native.daily"
        ),
        dict(
            experiment="eerie-control-1950",
            version=["v20240304"],
            example="ocean.native.daily"
        ),
        dict(
            experiment="hist-1950",
            version=["v20240304"],
            example="ocean.gr025.2D_daily_avg_1950-2014"
        ),
    ],
    "icon-esm-er": [
        dict(
            experiment="eerie-spinup-1950",
            version=["v20240618"],
            example="ocean.native.2d_daily_mean"
        ),
        dict(
            experiment="eerie-control-1950",
            version=["v20240618"],
            example="atmos.native.2d_daily_mean"
        ),
        dict(
            experiment="hist-1950",
            version=["v20240618"],
            example="ocean.native.2d_daily_mean"
        ),
    ],
    "ifs-amip-tco1279": [
        dict(
            experiment="hist",
            version=["v20240901"],
            example="atmos.gr025.2D_monthly"
        ),
        dict(
            experiment="hist-c-0-a-lr20",
            version=["v20240901"],
            example="atmos.gr025.2D_monthly"
        )
    ],
    "ifs-amip-tco399": [
        dict(
            experiment="hist",
            version=["v20240901"],
            example="atmos.gr025.2D_daily"
        ),
        dict(
            experiment="hist-c-0-a-lr20",
            version=["v20240901"],
            example="atmos.gr025.2D_daily"
        ),
        dict(
            experiment="hist-c-lr20-a-0",
            version=["v20240901"],
            example="atmos.gr025.2D_daily"
        ),
    ],
    "ifs-nemo": [
        dict(
            experiment="eerie-spinup-1950",
            version=["v20241010"],
            example="atmos.gr1x1.monthly"
        ),
    ],
    "hadgem3-gc5-n640-orca12": [dict(
        experiment="eerie-picontrol",
        example="atmos.native.atmos_monthly_emon"
    )],
    "hadgem3-gc5-n216-orca025": [dict(
        experiment="eerie-picontrol",
        example="atmos.native.atmos_monthly_emon"
    )]
}
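The two invariants noted in the comments above can be verified with a quick sanity loop; this is just an illustrative sketch:

# Sanity check: every entry defines "example"; "version", when present, is a list.
for source_id, experiments in phase1_simulations.items():
    for experiment in experiments:
        assert "example" in experiment, f"{source_id}: 'example' must be defined"
        if "version" in experiment:
            assert isinstance(experiment["version"], list), f"{source_id}: 'version' must be a list"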
Statistics
No of xarray datasets
The number of xarray datasets per Phase 1 simulation is equivalent to the number of entries in the intake catalogue for the specific simulation.

No of variables
The number of variables per Phase 1 simulation is computed by summing up `len(ds.data_vars)` for each xarray dataset `ds` of a simulation. That means "variables" are a combination of aggregation and variable name, similar to the definition of a CMOR variable. 2 m temperature can be counted multiple times if it is written to multiple datasets, i.e. multiple aggregations.

Size in memory [TB]
The size in memory per Phase 1 simulation is computed by summing up `ds.nbytes` for each xarray dataset `ds` of a simulation. This does not reflect the actual volume on disk because the datasets can be stored in compressed form.

Start simulation year
The start simulation year is the first year of the example dataset.

End simulation year
The end simulation year is the last year of the example dataset.
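As a minimal illustration of how these quantities are derived, here is a sketch on a small synthetic dataset (not EERIE data):

import numpy as np
import pandas as pd
import xarray as xr

# Synthetic example: two daily-mean variables on a tiny grid.
time = pd.date_range("1950-01-01", "1952-12-31", freq="D")
ds = xr.Dataset(
    {
        "tas": (("time", "lat"), np.zeros((time.size, 3))),
        "pr": (("time", "lat"), np.zeros((time.size, 3))),
    },
    coords={"time": time, "lat": [0.0, 1.0, 2.0]},
)

print(len(ds.data_vars))    # contribution to "No of variables": 2
print(ds.nbytes / 1024**4)  # contribution to "Size in memory [TB]"
years = ds["time"].groupby("time.year").groups
print(list(years.keys())[0], list(years.keys())[-1])  # start/end year: 1950 1952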
eerie_cat = intake.open_catalog(
    #"https://raw.githubusercontent.com/eerie-project/intake_catalogues/main/dkrz/disk/model-output/main.yaml"
    "/work/bm1344/DKRZ/intake_catalogues/dkrz/disk/model-output/main.yaml"
)
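When not working on Levante, the GitHub-hosted catalogue (the commented URL above) can serve as a fallback. A minimal sketch, assuming a missing local path raises an OSError:

# Sketch: prefer the Levante file system, fall back to the GitHub-hosted catalogue.
try:
    eerie_cat = intake.open_catalog(
        "/work/bm1344/DKRZ/intake_catalogues/dkrz/disk/model-output/main.yaml"
    )
except OSError:
    eerie_cat = intake.open_catalog(
        "https://raw.githubusercontent.com/eerie-project/intake_catalogues/main/dkrz/disk/model-output/main.yaml"
    )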
def find_data_sources(catalog, name=None):
    newname = '.'.join([a for a in [name, catalog.name] if a])
    data_sources = []
    for key, entry in catalog.items():
        if isinstance(entry, intake.catalog.Catalog):
            if newname == "main":
                newname = None
            # If the entry is a subcatalog, recursively search it
            data_sources.extend(find_data_sources(entry, newname))
        elif isinstance(entry, intake.source.base.DataSource):
            if newname:
                data_sources.append(newname + "." + key)
            else:
                data_sources.append(key)
    return data_sources
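For example, listing the data sources of one model and opening one of the example datasets from the dictionary above might look like this; the returned names follow the pattern `<model>.<experiment>.<version>.<realm>.<grid>.<aggregation>`:

# List all data sources below one model's subcatalogue.
sources = find_data_sources(eerie_cat["icon-esm-er"])
# Entries look like, e.g.:
# "icon-esm-er.eerie-control-1950.v20240618.atmos.native.2d_daily_mean"

# Open one dataset via its dotted path relative to the model subcatalogue.
ds = eerie_cat["icon-esm-er"][
    "eerie-control-1950.v20240618.atmos.native.2d_daily_mean"
].to_dask()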
%%capture
sizedict = {}
dflist = []
for source_id, experiments in phase1_simulations.items():
    print(source_id)
    cat_source = eerie_cat[source_id]
    dslist = find_data_sources(cat_source)
    for idx, experiment in enumerate(experiments):
        sdict = dict(source=source_id)
        datasets = {}
        exp_id = experiment["experiment"]
        sdict["experiment"] = exp_id
        version = experiment.get("version", None)
        sdict["version"] = "latest"
        print(exp_id)
        if version:
            sdict["version"] = version[-1]
            for vid in version:
                for ds in dslist:
                    if '.'.join([exp_id, vid]) not in ds:
                        continue
                    print(ds)
                    try:
                        datasets[ds] = cat_source['.'.join(ds.split('.')[1:])].to_dask()
                    except Exception:
                        print("Could not load " + '.'.join(ds.split('.')[1:]))
        else:
            for ds in dslist:
                if exp_id not in ds:
                    continue
                print(ds)
                datasets[ds] = cat_source['.'.join(ds.split('.')[1:])].to_dask()
        phase1_simulations[source_id][idx]["datasets"] = copy(datasets)
        #
        # assume datasets is the latest version
        #
        no_of_variables = 0
        size = 0
        sdict["No of xarray datasets"] = len(datasets)
        exds = None
        for name, ds in datasets.items():
            # The ICON spinup is only considered up to and including 1990.
            if "icon" in source_id and "spinup" in exp_id:
                ds = ds.where(ds.time.dt.year.isin(range(1950, 1991)), drop=True)
            if experiment["example"] in name:
                exds = ds
            # Only count datasets of the latest version (if versions are given).
            if (not version) or (sdict["version"] in name):
                size += ds.nbytes
                sizedict[name] = ds.nbytes  # remember each dataset's individual size
                no_of_variables += len(ds.data_vars)
        sdict["No of variables"] = no_of_variables
        sdict["Size in memory [TB]"] = size / 1024**4
        try:
            years = exds["time"].groupby("time.year").groups
            sdict["Start simulation year"] = list(years.keys())[0]
            sdict["End simulation year"] = list(years.keys())[-1]
        except Exception:
            # Show the example dataset if its time axis cannot be grouped by year.
            display(exds)
        dflist.append(copy(sdict))
sourcedf = pd.DataFrame(dflist)  # .transpose()
#sourcedf.to_csv("statistics.csv")
sourcedf.columns
Index(['source', 'experiment', 'version', 'No of xarray datasets',
'No of variables', 'Size in memory [TB]', 'Start simulation year',
'End simulation year'],
dtype='object')
sy = sourcedf[["source", "experiment", "Size in memory [TB]", "No of xarray datasets", "No of variables"]].copy()
sy["Simulation years"] = sourcedf["End simulation year"] - sourcedf["Start simulation year"] + 1
sy = sy.set_index(["source", "experiment"]).sort_values(by="Size in memory [TB]")
yl = "Size in memory [TB]"
title = yl + " " + now.strftime("%Y-%m-%d")
plot = sy[yl].hvplot.barh(
    stacked=True, height=500, legend='bottom_right', grid=True, cmap="bmy", title=title
).opts(fontsize={
    'title': 15,
    'labels': 14,
    'yticks': 12
})
plot
yl = "Simulation years"
title = yl + " " + now.strftime("%Y-%m-%d")
plot = sy[yl].hvplot.barh(
    stacked=True, height=500, legend='bottom_right', grid=True, cmap="bmy", title=title
).opts(fontsize={
    'title': 16,
    'labels': 16,
    'yticks': 12
})
plot
#hvplot.save(plot,"statistics_years.html")