Getting file names from intake with a command-line Python script

This is a Python command-line tool that prints the names of files containing the desired variables, along with other catalog information.

#!/sw/spack-levante/mambaforge-4.11.0-0-Linux-x86_64-sobz6z/bin/python

import argparse


def get_from_cat(catalog, field, searchdict=None):
    """Call this to get all values of a field in the catalog as a sorted list"""
    if searchdict is not None and len(searchdict) > 0:
        cat = catalog.search(**searchdict)
    else:
        cat = catalog
    return sorted(cat.unique(field)[field]["values"])


def parse_args():
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.description = """List files for a given variable and simulation."""
    parser.epilog = """Note that regular expressions can be used, but need to follow the "full" regex-syntax.
    "20-02" will search for exactly "2020-02" (no regex used).
    "20-02*" will search for anything *containing* "20-0" and an arbitrary number of "2"s following that, so "2020-03" also matches.
    "20-02.*" will search for anything *containing* "20-02" .
    "^20-02.*" will search for anything *starting with* "20-02" .
    "2020-.*-03T" will search for anything *containing*  "2020", followed by an arbitrary number of characters followed by "03T".
    
Use "" to leave variable_id or simulation_id empty.

Use 
find_files "" "" -f "experiment_id,source_id" 
to get a list of all experiments and participating models available
    """
    parser.add_argument("variable_id")
    parser.add_argument("simulation_id")
    # optional arguments (those with default are added below)
    optional_args = [
        "project",
        "institution_id",
        "source_id",
        "experiment_id",
        "realm",
        "frequency",
        "time_reduction",
        "grid_label",
        "level_type",
        "time_min",
        "time_max",
        "grid_id",
        "format",
        "uri",
    ]
    for x in optional_args:
        parser.add_argument(f"--{x}", action="append")
    parser.add_argument(
        "-c", "--catalog_file", default="/work/ka1081/Catalogs/dyamond-nextgems.json"
    )
    parser.add_argument(
        "-f",
        "--print_format",
        default="uri",
        help="Comma separated list of columns to be plotted. e.g. 'variable_id,source_id'",
    )
    parser.add_argument(
        "--full", action="store_true", help="Print full dataset information"
    )

    # drop optional arguments that were not given on the command line (argparse leaves them as None)
    pruned_dict = {k: v for k, v in vars(parser.parse_args()).items() if v is not None}
    # "full" is a boolean flag: keep it as the string "True" if set, drop it otherwise
    if pruned_dict["full"]:
        pruned_dict["full"] = "True"
    else:
        del pruned_dict["full"]
    # drop options passed as empty strings (e.g. "" for variable_id or simulation_id)
    pruned_dict = {k: v for k, v in pruned_dict.items() if len(v) > 0}
    # options given only once arrive as one-element lists; unwrap them to plain strings
    for k, v in pruned_dict.items():
        if len(v) == 1:
            pruned_dict[k] = v[0]
    return pruned_dict


if __name__ == "__main__":
    options = parse_args()
    import intake

    catalog_file = options["catalog_file"]
    del options["catalog_file"]
    cat = intake.open_esm_datastore(catalog_file)
    fmt = options.get("print_format")
    del options["print_format"]
    try:
        if options.get("full", False):
            del options["full"]
            import pandas as pd

            pd.set_option("display.max_columns", None)
            pd.set_option("max_colwidth", None)
            pd.set_option("display.width", 10000)
            print(cat.search(**options).df)
        else:
            cols = fmt.split(",")
            if len(cols) == 1:
                matches = get_from_cat(cat, fmt, options)
                [print(x) for x in matches]
            else:
                import pandas as pd

                pd.set_option("display.max_columns", None)
                pd.set_option("max_colwidth", None)
                pd.set_option("display.width", 10000)
                pd.set_option("display.max_rows", None)
                hitlist = cat
                if len(options):
                    hitlist = hitlist.search(**options)
                hitlist = (
                    hitlist.df[cols]
                    .drop_duplicates()
                    .sort_values(cols)
                    .to_string(index=False)
                )
                print(hitlist)
    except ValueError:
        import sys

        print(
            "\nERROR: Could not find any matches for your query ",
            options,
            "in catalog ",
            catalog_file,
            file=sys.stderr,
        )
        sys.exit(1)
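
For orientation, the wrapper above does little more than the following interactive session; this is a minimal sketch, assuming intake-esm is installed and using the default catalog path and column names from the script:

import intake

# default catalog used by the script above; adjust the path for a different catalog
cat = intake.open_esm_datastore("/work/ka1081/Catalogs/dyamond-nextgems.json")
# equivalent of "find_files tas dpp0066" on the command line
hits = cat.search(variable_id="tas", simulation_id="dpp0066")
for uri in sorted(hits.df["uri"].unique()):
    print(uri)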

A usage example (after saving the code as find_files and making it executable with chmod a+x find_files):

cdo  -timmean -select,name=tas [ $(./find_files tas dpp0066 --time_min="2020-02-0.*") ] /my/output/file
cdo(1) select: Process started
cdo(1) select: 100%
cdo(1) select: Processed 9059696640 values from 333 variables over 432 timesteps.
cdo    timmean: Processed 9059696640 values from 1 variable over 432 timesteps [46.86s 524MB].
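
If the file list is needed inside a Python workflow rather than in a shell pipeline, the wrapper can also be called from Python; a minimal sketch, assuming the script is saved as ./find_files in the current directory:

import subprocess

# run the wrapper and collect its output (one file path per line)
result = subprocess.run(
    ["./find_files", "tas", "dpp0066", "--time_min=2020-02-0.*"],
    capture_output=True, text=True, check=True,
)
files = result.stdout.split()
print(f"found {len(files)} files")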

Use --full to get detailed information:

./find_files ua dpp0066 --time_min='2020-02-02.*' --full
           variable_id   project institution_id source_id experiment_id  \
0  (ua, va, vor, gpsm)  NextGEMS          MPI-M  ICON-ESM  Cycle2-alpha
1             (ua, va)  NextGEMS          MPI-M  ICON-ESM  Cycle2-alpha

  simulation_id realm frequency time_reduction grid_label level_type  \
0       dpp0066   atm     3hour           inst         gn         pl
1       dpp0066   atm     3hour           mean         gn         ml

                  time_min                 time_max          grid_id  format  \
0  2020-02-02T00:00:00.000  2020-02-02T23:59:20.000  not implemented  netcdf
1  2020-02-02T00:00:00.000  2020-02-02T23:59:20.000  not implemented  netcdf

                                                                                  uri
0  /work/mh0287/m300083/experiments/dpp0066/dpp0066_atm_2d_850_pl_20200202T000000Z.nc
1    /work/mh0287/m300083/experiments/dpp0066/dpp0066_atm_3d_2_ml_20200202T000000Z.nc

Or with -f/--print_format to get specific columns:

./find_files to dpp0066  -f 'experiment_id,simulation_id,frequency'
experiment_id simulation_id frequency
 Cycle2-alpha       dpp0066      1day
 Cycle2-alpha       dpp0066     1hour
 Cycle2-alpha       dpp0066     3hour

find_files to dpp0066  -f 'experiment_id,simulation_id,frequency,uri' --time_min='2020-02-22.*'
experiment_id simulation_id frequency                                                                                   uri
 Cycle2-alpha       dpp0066      1day       /work/mh0287/m300083/experiments/dpp0066/dpp0066_oce_3d_P1D_20200222T000000Z.nc
 Cycle2-alpha       dpp0066      1day    /work/mh0287/m300083/experiments/dpp0066/dpp0066_oce_3dlev_P1D_20200222T000000Z.nc
 Cycle2-alpha       dpp0066     1hour   /work/mh0287/m300083/experiments/dpp0066/dpp0066_oce_2dopt_PT1H_20200222T000000Z.nc
 Cycle2-alpha       dpp0066     3hour /work/mh0287/m300083/experiments/dpp0066/dpp0066_oce_3du200m_PT3H_20200222T000000Z.nc

Now assume that using the daily files would cause trouble in further processing, because two filesets contain the data. An additional filter on uri narrows the selection:

find_files to dpp0066  -f 'experiment_id,simulation_id,frequency,uri' --time_min='2020-02-22.*' --uri='.*3dlev.*'
experiment_id simulation_id frequency                                                                                uri
 Cycle2-alpha       dpp0066      1day /work/mh0287/m300083/experiments/dpp0066/dpp0066_oce_3dlev_P1D_20200222T000000Z.nc
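
The uri column obtained this way can also be used outside of cdo, for example with xarray; a minimal sketch, assuming xarray is installed and that the variable keeps its variable_id name inside the file, taking the path from the output above:

import xarray as xr

# path copied from the find_files output above
path = "/work/mh0287/m300083/experiments/dpp0066/dpp0066_oce_3dlev_P1D_20200222T000000Z.nc"
ds = xr.open_dataset(path)
print(ds["to"])  # 'to' is the variable_id we searched for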

Call it as

./find_files -h

to get the full help message.