Listing the entries in an intake catalog

(or doing other manipulations on them)

[1]:
import intake
import logging
from collections.abc import Iterable
[2]:
def warn_esm_cat(cat, child, position):
    """Log a warning that ``child`` is skipped because it looks like an intake-esm catalog.

    This is the default ``esm_cat_callback`` of ``traverse_tree``.

    Args:
        cat: The catalog containing ``child`` (unused here; part of the
            common callback signature).
        child: Name of the skipped entry.
        position: Names of the ancestors of ``child``; joined with dots
            to build the path shown in the message.
    """
    message = f"skipping {'.'.join(position)}.{child}, as it seems to be an intake-esm catalog"
    logging.warning(message)


def traverse_tree(
    cat,
    subcat_callback,
    entry_callback,
    esm_cat_callback=warn_esm_cat,
    levels=0,
    position=None,
):
    """Traverse an intake tree and call a function on everything found in it.

    All callbacks are invoked as ``callback(cat, child, position)``, where
    ``cat`` is the catalog currently being iterated, ``child`` is the name of
    the entry inside it and ``position`` is the list of names leading from the
    root to ``cat``.

    Args:
        cat: The catalog to walk.
        subcat_callback: Called on anything iterable (should be sub-catalogs),
            before descending into it.
        entry_callback: Called on anything not iterable (should be datasets).
        esm_cat_callback: Called on intake-esm catalogs instead of opening
            them. Defaults to a warning message, as loading them can consume
            a lot of memory and time.
        levels: Maximum number of levels to visit; ``0`` means no limit.
        position: Names of the ancestors of ``cat``. Leave unset when calling
            on the root catalog.
    """
    # Avoid a shared mutable default argument.
    if position is None:
        position = []

    # Stop once the requested depth has been reached.
    if levels and (levels - 1 < len(position)):
        return

    for child in list(cat):
        logging.debug(f"processing {child}")
        if detect_esm_cat(cat, child, position):
            esm_cat_callback(cat, child, position)
            continue
        try:
            # Resolve the entry only once; each lookup may be expensive.
            entry = cat[child]
        except FileNotFoundError as missing:
            logging.critical(
                f"Error processing {'.'.join(position)}.{child}: File not found: {missing}"
            )
            continue

        if isinstance(entry, Iterable):
            subcat_callback(cat, child, position)
            traverse_tree(
                entry,
                subcat_callback,
                entry_callback,
                # Forward the caller's callback so it also applies to nested levels.
                esm_cat_callback=esm_cat_callback,
                levels=levels,
                position=position + [child],
            )
        else:
            entry_callback(cat, child, position)


def detect_esm_cat(cat, child, position):
    """Return True if entry ``child`` of ``cat`` appears to be an intake-esm catalog.

    The decision is based on whether the string form of the entry's driver
    contains ``"esm_datastore"``. If the driver cannot be inspected for any
    reason, the problem is logged as an error and ``False`` is returned.

    Args:
        cat: Catalog holding the entry; its ``_entries`` mapping is inspected.
        child: Name of the entry to inspect.
        position: Names of the ancestors of ``child`` (used only in the log
            message).
    """
    try:
        driver_description = str(cat._entries[child]._driver)
    except Exception as e:
        logging.error(
            f"Can't really decide the type of \n{position}{child}\nran into {e}"
        )
        return False
    return "esm_datastore" in driver_description


def print_tree(cat, levels=0):
    """Print the entries of an intake catalog as an indented tree.

    Each entry is printed on its own line, indented by two spaces per level.
    The names of its user parameters (if any) follow in parentheses, and
    sub-catalogs are marked with a tree symbol.

    Args:
        cat: The catalog to print.
        levels: Maximum number of levels to print; ``0`` means no limit.
    """

    def printer(cat, child, position, appendix=""):
        # Default to no parameter list, so the line can still be printed
        # when the description of the entry cannot be obtained.
        parameter_str = ""
        try:
            parameters = [
                p["name"] for p in cat[child].describe().get("user_parameters", [])
            ]
            if parameters:
                parameter_str = f"({', '.join(parameters)})"
        except Exception as e:
            # This particular error is expected for some sources and is not
            # worth reporting; anything else is logged.
            if str(e) != "Source was not made from a catalog entry":
                logging.warning(str(e))
        print(f"{'  '*len(position)}{child} {parameter_str} {appendix}")

    def subcat_printer(*args, **kwargs):
        return printer(*args, **kwargs, appendix="🌳")

    traverse_tree(cat, subcat_printer, printer, levels=levels)
[3]:
# Open the catalog published at this URL; it is the root of the tree listed below.
cat = intake.open_catalog("https://data.nextgems-h2020.eu/catalog.yaml")
[4]:
# Print only the first two levels of the catalog tree.
print_tree(cat, levels=2)
ICON  🌳
  ngc4008 (time, zoom)
  ngc4007 (time, zoom)
  ngc4006 (time, zoom)
  ngc4005 (time, zoom)
  ngc3028 (time, zoom)
  ngc3028_bc_land
  ngc3026_WILL_BE_DELETED (time, zoom)
  HAMOCC  🌳
  ngc3542  🌳
  erc1011  🌳
  erc1017  🌳
IFS  🌳
  IFS_9-FESOM_5-production  🌳
  IFS_4.4-FESOM_5-cycle3  🌳
  IFS_9-FESOM_5-cycle3  🌳
  IFS_9-NEMO_25-cycle3  🌳
  IFS_28-NEMO_25-cycle3  🌳
  IFS_4.4-FESOM_5-cycle3-nofastdata  🌳
  IFS_4.4-FESOM_5-cycle3-fastdata  🌳
  IFS_grids  🌳
FESOM  🌳
  IFS_4.4-FESOM_5-cycle3  🌳
  IFS_28-FESOM_25-cycle3  🌳
  IFS_9-FESOM_5-cycle3  🌳
  FESOM_13_tropo_age_interpolated  🌳