Getting file names from intake with a command-line Python script
This is a command-line Python tool that lists the files containing a desired variable, along with other information from the catalog.
#!/sw/spack-levante/mambaforge-4.11.0-0-Linux-x86_64-sobz6z/bin/python
import argparse


def get_from_cat(catalog, field, searchdict=None):
    """Call this to get all values of a field in the catalog as a sorted list"""
    if searchdict is not None and len(searchdict) > 0:
        cat = catalog.search(**searchdict)
    else:
        cat = catalog
    return sorted(cat.unique(field)[field]["values"])


def parse_args():
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.description = """List files for a given variable and simulation."""
    parser.epilog = """Note that regular expressions can be used, but need to follow the "full" regex syntax.
"20-02" will search for exactly "20-02" (no regex used).
"20-02*" will search for anything *containing* "20-0" and an arbitrary number of "2"s following that, so "2020-03" also matches.
"20-02.*" will search for anything *containing* "20-02".
"^20-02.*" will search for anything *starting with* "20-02".
"2020-.*-03T" will search for anything *containing* "2020", followed by an arbitrary number of characters, followed by "03T".
Use "" to leave variable_id or simulation_id empty.
Use
    find_files "" "" -f "experiment_id,source_id"
to get a list of all experiments and participating models available."""
    parser.add_argument("variable_id")
    parser.add_argument("simulation_id")
    # optional arguments passed through to the catalog search
    # (those with a default are added separately below)
    optional_args = [
        "project",
        "institution_id",
        "source_id",
        "experiment_id",
        "realm",
        "frequency",
        "time_reduction",
        "grid_label",
        "level_type",
        "time_min",
        "time_max",
        "grid_id",
        "format",
        "uri",
    ]
    for x in optional_args:
        parser.add_argument(f"--{x}", action="append")
    parser.add_argument(
        "-c", "--catalog_file", default="/work/ka1081/Catalogs/dyamond-nextgems.json"
    )
    parser.add_argument(
        "-f",
        "--print_format",
        default="uri",
        help="Comma-separated list of columns to be printed, e.g. 'variable_id,source_id'",
    )
    parser.add_argument(
        "--full", action="store_true", help="Print full dataset information"
    )
    pruned_dict = {k: v for k, v in vars(parser.parse_args()).items() if v is not None}
    # store_true yields a bool; carry it as a string so the len() pruning below works
    if pruned_dict["full"]:
        pruned_dict["full"] = "True"
    else:
        del pruned_dict["full"]
    pruned_dict = {k: v for k, v in pruned_dict.items() if len(v) > 0}
    # unwrap single-element lists produced by action="append"
    for k, v in pruned_dict.items():
        if len(v) == 1:
            pruned_dict[k] = v[0]
    return pruned_dict


if __name__ == "__main__":
    options = parse_args()
    import intake

    catalog_file = options["catalog_file"]
    del options["catalog_file"]
    cat = intake.open_esm_datastore(catalog_file)
    fmt = options.get("print_format")
    del options["print_format"]
    try:
        if options.get("full", False):
            del options["full"]
            import pandas as pd

            pd.set_option("display.max_columns", None)
            pd.set_option("max_colwidth", None)
            pd.set_option("display.width", 10000)
            print(cat.search(**options).df)
        else:
            cols = fmt.split(",")
            if len(cols) == 1:
                # single column: print the sorted unique values, one per line
                matches = get_from_cat(cat, fmt, options)
                for x in matches:
                    print(x)
            else:
                import pandas as pd

                pd.set_option("display.max_columns", None)
                pd.set_option("max_colwidth", None)
                pd.set_option("display.width", 10000)
                pd.set_option("display.max_rows", None)
                hitlist = cat
                if len(options):
                    hitlist = hitlist.search(**options)
                hitlist = (
                    hitlist.df[cols]
                    .drop_duplicates()
                    .sort_values(cols)
                    .to_string(index=False)
                )
                print(hitlist)
    except ValueError:
        import sys

        print(
            "\nERROR: Could not find any matches for your query",
            options,
            "in catalog",
            catalog_file,
            file=sys.stderr,
        )
        sys.exit(1)
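The script is only a thin wrapper around intake-esm: it opens the catalog with intake.open_esm_datastore(), forwards all options to search(), and prints either the unique values of one column or a de-duplicated DataFrame slice. For reference, a minimal interactive sketch of the default single-column lookup (the unique() call below follows the older intake-esm API used by the script; newer intake-esm versions return the unique values in a different format):

import intake

# catalog path taken from the script's --catalog_file default
cat = intake.open_esm_datastore("/work/ka1081/Catalogs/dyamond-nextgems.json")

# roughly equivalent to: ./find_files tas dpp0066 --time_min="2020-02-0.*"
hits = cat.search(variable_id="tas", simulation_id="dpp0066", time_min="2020-02-0.*")
for uri in sorted(hits.unique("uri")["uri"]["values"]):  # older intake-esm API
    print(uri)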
A usage example, after saving the code as find_files and making it executable with chmod a+x find_files, would be the following (the [ and ] brackets let cdo's select operator take the variable number of file names produced by the command substitution):
cdo -timmean -select,name=tas [ $(./find_files tas dpp0066 --time_min="2020-02-0.*") ] /my/output/file
cdo(1) select: Process started
cdo(1) select: 100%
cdo(1) select: Processed 9059696640 values from 333 variables over 432 timesteps.
cdo timmean: Processed 9059696640 values from 1 variable over 432 timesteps [46.86s 524MB].
Use --full to get detailed information:
./find_files ua dpp0066 --time_min='2020-02-02.*' --full
variable_id project institution_id source_id experiment_id \
0 (ua, va, vor, gpsm) NextGEMS MPI-M ICON-ESM Cycle2-alpha
1 (ua, va) NextGEMS MPI-M ICON-ESM Cycle2-alpha
simulation_id realm frequency time_reduction grid_label level_type \
0 dpp0066 atm 3hour inst gn pl
1 dpp0066 atm 3hour mean gn ml
time_min time_max grid_id format \
0 2020-02-02T00:00:00.000 2020-02-02T23:59:20.000 not implemented netcdf
1 2020-02-02T00:00:00.000 2020-02-02T23:59:20.000 not implemented netcdf
uri
0 /work/mh0287/m300083/experiments/dpp0066/dpp0066_atm_2d_850_pl_20200202T000000Z.nc
1 /work/mh0287/m300083/experiments/dpp0066/dpp0066_atm_3d_2_ml_20200202T000000Z.nc
Or use -f (--print_format) to get specific columns:
./find_files to dpp0066 -f 'experiment_id,simulation_id,frequency'
experiment_id simulation_id frequency
Cycle2-alpha dpp0066 1day
Cycle2-alpha dpp0066 1hour
Cycle2-alpha dpp0066 3hour
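This multi-column listing is produced from the catalog's underlying pandas DataFrame; a sketch of roughly equivalent interactive code, reusing the cat object opened in the earlier snippet:

# roughly equivalent to: find_files to dpp0066 -f 'experiment_id,simulation_id,frequency'
cols = ["experiment_id", "simulation_id", "frequency"]
hits = cat.search(variable_id="to", simulation_id="dpp0066")
print(hits.df[cols].drop_duplicates().sort_values(cols).to_string(index=False))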
find_files to dpp0066 -f 'experiment_id,simulation_id,frequency,uri' --time_min='2020-02-22.*'
experiment_id simulation_id frequency uri
Cycle2-alpha dpp0066 1day /work/mh0287/m300083/experiments/dpp0066/dpp0066_oce_3d_P1D_20200222T000000Z.nc
Cycle2-alpha dpp0066 1day /work/mh0287/m300083/experiments/dpp0066/dpp0066_oce_3dlev_P1D_20200222T000000Z.nc
Cycle2-alpha dpp0066 1hour /work/mh0287/m300083/experiments/dpp0066/dpp0066_oce_2dopt_PT1H_20200222T000000Z.nc
Cycle2-alpha dpp0066 3hour /work/mh0287/m300083/experiments/dpp0066/dpp0066_oce_3du200m_PT3H_20200222T000000Z.nc
# Suppose the daily files would cause trouble in further processing, because two filesets contain the data; restrict the match with --uri:
find_files to dpp0066 -f 'experiment_id,simulation_id,frequency,uri' --time_min='2020-02-22.*' --uri='.*3dlev.*'
experiment_id simulation_id frequency uri
Cycle2-alpha dpp0066 1day /work/mh0287/m300083/experiments/dpp0066/dpp0066_oce_3dlev_P1D_20200222T000000Z.nc
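Because every --option is forwarded verbatim to search(), the same regex filter on uri (or any other column) also works interactively; a sketch under the same assumptions as above:

# roughly equivalent to the find_files call above, restricted to the 3dlev fileset
hits = cat.search(
    variable_id="to",
    simulation_id="dpp0066",
    time_min="2020-02-22.*",
    uri=".*3dlev.*",  # regex matched against the uri column
)
print(hits.df[["experiment_id", "simulation_id", "frequency", "uri"]].drop_duplicates().to_string(index=False))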
Call it as ./find_files -h to get the full help message.