Phase 1#

This notebook provides an overview and generates and plots statistics about the EERIE Phase 1 production simulation data available at DKRZ’s High Performance Computer Levante. We browse through the EERIE intake catalog, search for phase 1 models, experiments and versions, and collect information from the datasets:

Other phase 1 sources:

  • The eerie.cloud provides a browsable interface using an EERIE STAC collection of the phase 1 simulations. The STAC collection is a mirror of the intake catalog.

  • The recent progress of the simulations is displayed here

Phase 1 simulations are a set of combinations of ESM and Experiments. For each entry in this set, we have exactly one version that is considered the phase one simulation.

  • IFS-FESOM2-SR: (eerie-spinup-1950, eerie-control-1950, hist-1950)

  • ICON-ESM-ER: (eerie-spinup-1950, eerie-control-1950, hist-1950)

  • IFS-AMIP-TCO399: (hist, hist-c-0-a-lr20, hist-c-lr20-a-0)

  • IFS-AMIP-TCO1279: (hist, hist-c-0-a-lr20)

  • IFS-NEMO: (eerie-spinup-1950)

  • hadgem3-gc5-n640-orca12 (piControl)

  • hadgem3-gc5-n216-orca025 (piControl)

# Standard scientific-Python stack plus intake (catalog access) and hvplot
# (interactive plotting backend for pandas objects).
import xarray as xr
import intake
from copy import deepcopy as copy
import pandas as pd
import hvplot.pandas
import hvplot
import datetime
# Timestamp used in plot titles so saved figures record when stats were generated.
now = datetime.datetime.now()
# Conventions for the phase1_simulations entries below:
#example must be defined
#version must be a list
# Phase 1 production simulations: one entry per (ESM, experiment) pair.
# Each entry carries:
#   experiment - experiment id as used in the intake catalog (required)
#   version    - list of dataset versions; the last item is the phase-1 one
#                (optional: absent means "use latest")
#   example    - dataset name used to derive start/end simulation years (required)
phase1_simulations = {
    "ifs-fesom2-sr": [
        {"experiment": "eerie-spinup-1950", "version": ["v20240304"],
         "example": "ocean.native.daily"},
        {"experiment": "eerie-control-1950", "version": ["v20240304"],
         "example": "ocean.native.daily"},
        {"experiment": "hist-1950", "version": ["v20240304"],
         "example": "ocean.gr025.2D_daily_avg_1950-2014"},
    ],
    "icon-esm-er": [
        {"experiment": "eerie-spinup-1950", "version": ["v20240618"],
         "example": "ocean.native.2d_daily_mean"},
        {"experiment": "eerie-control-1950", "version": ["v20240618"],
         "example": "atmos.native.2d_daily_mean"},
        {"experiment": "hist-1950", "version": ["v20240618"],
         "example": "ocean.native.2d_daily_mean"},
    ],
    "ifs-amip-tco1279": [
        {"experiment": "hist", "version": ["v20240901"],
         "example": "atmos.gr025.2D_monthly"},
        {"experiment": "hist-c-0-a-lr20", "version": ["v20240901"],
         "example": "atmos.gr025.2D_monthly"},
    ],
    "ifs-amip-tco399": [
        {"experiment": "hist", "version": ["v20240901"],
         "example": "atmos.gr025.2D_daily"},
        {"experiment": "hist-c-0-a-lr20", "version": ["v20240901"],
         "example": "atmos.gr025.2D_daily"},
        {"experiment": "hist-c-lr20-a-0", "version": ["v20240901"],
         "example": "atmos.gr025.2D_daily"},
    ],
    "ifs-nemo": [
        {"experiment": "eerie-spinup-1950", "version": ["v20241010"],
         "example": "atmos.gr1x1.monthly"},
    ],
    # HadGEM entries have no pinned version: the latest available one is used.
    "hadgem3-gc5-n640-orca12": [
        {"experiment": "eerie-picontrol",
         "example": "atmos.native.atmos_monthly_emon"},
    ],
    "hadgem3-gc5-n216-orca025": [
        {"experiment": "eerie-picontrol",
         "example": "atmos.native.atmos_monthly_emon"},
    ],
}

Statistics#

  • No of xarray datasets

    The number of xarray datasets per phase 1 simulation is equivalent to the sum of the entries in the intake catalogue for the specific simulation.

  • No of variables

    The number of variables per phase 1 simulation is computed by summing up len(ds.data_vars) for each xarray dataset ds of a simulation. That means, “variables” are a combination of aggregation and variable name, similar to the definition of a CMOR variable. 2m Temperature can be counted multiple times if it is written for multiple datasets, i.e. multiple aggregations.

  • Size in memory [TB]

    The size in memory per phase 1 simulation is computed by summing up ds.size for each xarray dataset ds of a simulation. This does not reflect the actual volume on disk because the datasets can be stored in a compressed form.

  • Start simulation year

    The start simulation year is the first year of the example dataset

  • End simulation year

    The end simulation year is the last year of the example dataset

# Open the EERIE model-output intake catalog. The local Levante path is used
# for speed; the commented GitHub URL is the equivalent public mirror.
eerie_cat=intake.open_catalog(
    #"https://raw.githubusercontent.com/eerie-project/intake_catalogues/main/dkrz/disk/model-output/main.yaml"
    "/work/bm1344/DKRZ/intake_catalogues/dkrz/disk/model-output/main.yaml"
)
def find_data_sources(catalog, name=None):
    """Recursively collect the dotted-path names of all data sources in *catalog*.

    *name* is the dotted prefix accumulated from parent catalogs; falsy parts
    (None or "") are skipped when building the prefix. A prefix equal to
    "main" is dropped once a sub-catalog is encountered, so the top-level
    "main" catalog name never appears in the returned paths.
    """
    prefix = '.'.join(part for part in (name, catalog.name) if part)
    found = []

    for key, entry in catalog.items():
        if isinstance(entry, intake.catalog.Catalog):
            # NOTE: this intentionally sticks for the rest of the loop once set.
            if prefix == "main":
                prefix = None
            # Sub-catalog: descend and gather its sources.
            found.extend(find_data_sources(entry, prefix))
        elif isinstance(entry, intake.source.base.DataSource):
            found.append(prefix + "." + key if prefix else key)

    return found
%%capture
sizedict={}
dflist=[]
for source_id,experiments in phase1_simulations.items():
    print(source_id)
    cat_source=eerie_cat[source_id]
    dslist = find_data_sources(cat_source)
    for idx,experiment in enumerate(experiments):
        sdict=dict(source=source_id)
        datasets={}        
        exp_id=experiment["experiment"]        
        sdict["experiment"]=exp_id
        version=experiment.get("version",None)
        sdict["version"]="latest"
        print(exp_id)
        if version:
            sdict["version"]=version[-1]
            for vid in version:
                for ds in dslist:
                    if not '.'.join([exp_id,vid]) in ds:
                        continue
                    print(ds)
                    try:
                        datasets[ds]=cat_source['.'.join(ds.split('.')[1:])].to_dask()
                    except:
                        print("Could not load "+'.'.join(ds.split('.')[1:]))
        else:
            for ds in dslist:
                if not exp_id in ds:
                    continue
                print(ds)
                datasets[ds]=cat_source['.'.join(ds.split('.')[1:])].to_dask()        
        phase1_simulations[source_id][idx]["datasets"]=copy(datasets)
        #
        #assume datasets is the latest version
        #
        no_of_variables=0
        size=0
        sdict["No of xarray datasets"]=len(datasets)
        exds=None
        for name,ds in datasets.items():
            if "icon" in source_id and "spinup" in exp_id:
                ds=ds.where(ds.time.dt.year.isin(range(1950,1991)),drop=True)
            if experiment["example"] in name:
                exds=ds
            if version:
                if sdict["version"] in name:
                    size+=ds.nbytes
                    sizedict[name]=size
                    no_of_variables+=len(ds.data_vars)
            else:
                size+=ds.nbytes
                sizedict[name]=size
                no_of_variables+=len(ds.data_vars)
        sdict["No of variables"]=no_of_variables
        sdict["Size in memory [TB]"]=size/1024**4
        
        try:
            years=exds["time"].groupby("time.year").groups        
        except:
            display(exds)
        sdict["Start simulation year"]=list(years.keys())[0]
        sdict["End simulation year"]=list(years.keys())[-1]

        dflist.append(copy(sdict))
# One row per (source, experiment) combination collected above.
sourcedf=pd.DataFrame(dflist)#.transpose()
#sourcedf.to_csv("statistics.csv")
sourcedf.columns
Index(['source', 'experiment', 'version', 'No of xarray datasets',
       'No of variables', 'Size in memory [TB]', 'Start simulation year',
       'End simulation year'],
      dtype='object')
# Derive the plotting table: explicit .copy() so the new column below is set
# on an independent frame, not a view (fixes the SettingWithCopyWarning).
sy=sourcedf[["source","experiment","Size in memory [TB]","No of xarray datasets","No of variables"]].copy()
# Inclusive year span covered by each simulation.
sy["Simulation years"]=sourcedf["End simulation year"]-sourcedf["Start simulation year"]+1
sy=sy.set_index(["source","experiment"]).sort_values(by="Size in memory [TB]")
/tmp/ipykernel_636883/1437414734.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sy["Simulation years"]=sourcedf["End simulation year"]-sourcedf["Start simulation year"]+1
#cl="Size in memory [TB]"
yl="Size in memory [TB]"
title=yl+" "+now.strftime("%Y-%m-%d")
plot=sy[yl].hvplot.barh(
    stacked=True, height=500, legend='bottom_right',grid=True,cmap="bmy",title=title
).opts(fontsize={
    'title': 15, 
    'labels': 14, 
    'yticks':12
})
plot
#cl="Size in memory [TB]"
yl="Simulation years"
yl="Simulation years"
title=yl+" "+now.strftime("%Y-%m-%d")
plot=sy[yl].hvplot.barh(
    stacked=True, height=500, legend='bottom_right',grid=True,cmap="bmy",title=title
).opts(fontsize={
    'title': 16, 
    'labels': 16, 
    'yticks':12
})
plot
#hvplot.save(plot,"statistics_years.html")