Define the encoding of a Zarr store#

import intake
import numcodecs
import numpy as np
import xarray as xr


# Open the intake catalog and load its HIFS entry for 2024-09-01 as a
# dask-backed xarray dataset.
cat = intake.open_catalog("https://tcodata.mpimet.mpg.de/internal.yaml")
ds = cat.HIFS(datetime="2024-09-01").to_dask()
# Keep only the time steps from 2024-09-01 up to and including 18:00 so the
# example stays small.
ds = ds.sel(time=slice("2024-09-01", "2024-09-01 18:00"))
# Bare expression: shows the dataset summary when run in a notebook cell.
ds

/usr/lib/python3.12/site-packages/intake_xarray/base.py:21: FutureWarning: The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.
  'dims': dict(self._ds.dims),

<xarray.Dataset> Size: 694MB
Dimensions:  (time: 6, cell: 196608, crs: 1, level: 13)
Coordinates:
  * crs      (crs) float64 8B nan
  * level    (level) int64 104B 50 100 150 200 250 300 ... 600 700 850 925 1000
  * time     (time) datetime64[ns] 48B 2024-09-01T03:00:00 ... 2024-09-01T18:...
Dimensions without coordinates: cell
Data variables: (12/39)
    100u     (time, cell) float32 5MB dask.array<chunksize=(6, 16384), meta=np.ndarray>
    100v     (time, cell) float32 5MB dask.array<chunksize=(6, 16384), meta=np.ndarray>
    10u      (time, cell) float32 5MB dask.array<chunksize=(6, 16384), meta=np.ndarray>
    10v      (time, cell) float32 5MB dask.array<chunksize=(6, 16384), meta=np.ndarray>
    2d       (time, cell) float32 5MB dask.array<chunksize=(6, 16384), meta=np.ndarray>
    2t       (time, cell) float32 5MB dask.array<chunksize=(6, 16384), meta=np.ndarray>
    ...       ...
    tp       (time, cell) float32 5MB dask.array<chunksize=(6, 16384), meta=np.ndarray>
    ttr      (time, cell) float32 5MB dask.array<chunksize=(6, 16384), meta=np.ndarray>
    u        (time, level, cell) float32 61MB dask.array<chunksize=(6, 1, 16384), meta=np.ndarray>
    v        (time, level, cell) float32 61MB dask.array<chunksize=(6, 1, 16384), meta=np.ndarray>
    vo       (time, level, cell) float32 61MB dask.array<chunksize=(6, 1, 16384), meta=np.ndarray>
    w        (time, level, cell) float32 61MB dask.array<chunksize=(6, 1, 16384), meta=np.ndarray>

xarray.Dataset

Dimensions:
- time: 6
- cell: 196608
- crs: 1
- level: 13

Coordinates: (3)

crs
(crs)
float64
nan
grid_mapping_name :
healpix
healpix_nside :
128
healpix_order :
nest
```
array([nan])
```
level
(level)
int64
50 100 150 200 ... 700 850 925 1000
axis :
Z
long_name :
Air pressure at model level
positive :
down
standard_name :
air_pressure
units :
hPa
```
array([  50,  100,  150,  200,  250,  300,  400,  500,  600,  700,  850,  925,
       1000])
```

time

(time)

datetime64[ns]

2024-09-01T03:00:00 ... 2024-09-...

axis :: T

array(['2024-09-01T03:00:00.000000000', '2024-09-01T06:00:00.000000000',
       '2024-09-01T09:00:00.000000000', '2024-09-01T12:00:00.000000000',
       '2024-09-01T15:00:00.000000000', '2024-09-01T18:00:00.000000000'],
      dtype='datetime64[ns]')

Data variables: (39)

100u

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: heightAboveGround
long_name :: 100 metre U wind component
standard_name :: eastward_wind
type :: forecast
units :: m s**-1

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

100v

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: heightAboveGround
long_name :: 100 metre V wind component
standard_name :: northward_wind
type :: forecast
units :: m s**-1

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

10u

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: heightAboveGround
long_name :: 10 metre U wind component
standard_name :: eastward_wind
type :: forecast
units :: m s**-1

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

10v

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: heightAboveGround
long_name :: 10 metre V wind component
standard_name :: northward_wind
type :: forecast
units :: m s**-1

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

2d

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: heightAboveGround
long_name :: 2 metre dewpoint temperature
standard_name :
type :: forecast
units :: K

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

2t

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: heightAboveGround
long_name :: 2 metre temperature
standard_name :: air_temperature
type :: forecast
units :: K

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

asn

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: surface
long_name :: Snow albedo
standard_name :
type :: forecast
units :: (0 - 1)

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

cape

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: entireAtmosphere
long_name :: Convective available potential energy
standard_name :
type :: forecast
units :: J kg**-1

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

d

(time, level, cell)

float32

dask.array<chunksize=(6, 1, 16384), meta=np.ndarray>

levtype :: isobaricInhPa
long_name :: Divergence
standard_name :: divergence_of_wind
type :: forecast
units :: s**-1

	Array	Chunk
Bytes	58.50 MiB	384.00 kiB
Shape	(6, 13, 196608)	(6, 1, 16384)
Dask graph	156 chunks in 3 graph layers
Data type	float32 numpy.ndarray

gh

(time, level, cell)

float32

dask.array<chunksize=(6, 1, 16384), meta=np.ndarray>

levtype :: isobaricInhPa
long_name :: Geopotential height
standard_name :: geopotential_height
type :: forecast
units :: gpm

	Array	Chunk
Bytes	58.50 MiB	384.00 kiB
Shape	(6, 13, 196608)	(6, 1, 16384)
Dask graph	156 chunks in 3 graph layers
Data type	float32 numpy.ndarray

lsm

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: surface
long_name :: Land-sea mask
standard_name :: land_binary_mask
type :: forecast
units :: (0 - 1)

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

mn2t6

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: heightAboveGround
long_name :: Minimum temperature at 2 metres in the last 6 hours
standard_name :: air_temperature
type :: forecast
units :: K

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

msl

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: meanSea
long_name :: Mean sea level pressure
standard_name :: air_pressure_at_mean_sea_level
type :: forecast
units :: Pa

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

mx2t6

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: heightAboveGround
long_name :: Maximum temperature at 2 metres in the last 6 hours
standard_name :: air_temperature
type :: forecast
units :: K

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

q

(time, level, cell)

float32

dask.array<chunksize=(6, 1, 16384), meta=np.ndarray>

levtype :: isobaricInhPa
long_name :: Specific humidity
standard_name :: specific_humidity
type :: forecast
units :: kg kg**-1

	Array	Chunk
Bytes	58.50 MiB	384.00 kiB
Shape	(6, 13, 196608)	(6, 1, 16384)
Dask graph	156 chunks in 3 graph layers
Data type	float32 numpy.ndarray

r

(time, level, cell)

float32

dask.array<chunksize=(6, 1, 16384), meta=np.ndarray>

levtype :: isobaricInhPa
long_name :: Relative humidity
standard_name :: relative_humidity
type :: forecast
units :: %

	Array	Chunk
Bytes	58.50 MiB	384.00 kiB
Shape	(6, 13, 196608)	(6, 1, 16384)
Dask graph	156 chunks in 3 graph layers
Data type	float32 numpy.ndarray

ro

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: surface
long_name :: Runoff
standard_name :
type :: forecast
units :: m

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

skt

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: surface
long_name :: Skin temperature
standard_name :
type :: forecast
units :: K

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

sp

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: surface
long_name :: Surface pressure
standard_name :: surface_air_pressure
type :: forecast
units :: Pa

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

ssr

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: surface
long_name :: Surface net short-wave (solar) radiation
standard_name :: surface_net_downward_shortwave_flux
type :: forecast
units :: J m**-2

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

ssrd

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: surface
long_name :: Surface short-wave (solar) radiation downwards
standard_name :: surface_downwelling_shortwave_flux_in_air
type :: forecast
units :: J m**-2

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

st

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: depthBelowLandLayer
long_name :: Soil temperature
standard_name :
type :: forecast
units :: K

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

stl2

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: depthBelowLandLayer
long_name :: Soil temperature level 2
standard_name :
type :: forecast
units :: K

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

stl3

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: depthBelowLandLayer
long_name :: Soil temperature level 3
standard_name :
type :: forecast
units :: K

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

stl4

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: depthBelowLandLayer
long_name :: Soil temperature level 4
standard_name :
type :: forecast
units :: K

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

str

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: surface
long_name :: Surface net long-wave (thermal) radiation
standard_name :: surface_net_upward_longwave_flux
type :: forecast
units :: J m**-2

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

strd

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: surface
long_name :: Surface long-wave (thermal) radiation downwards
standard_name :
type :: forecast
units :: J m**-2

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

swvl1

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: depthBelowLandLayer
long_name :: Volumetric soil water layer 1
standard_name :
type :: forecast
units :: m**3 m**-3

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

swvl2

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: depthBelowLandLayer
long_name :: Volumetric soil water layer 2
standard_name :
type :: forecast
units :: m**3 m**-3

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

swvl3

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: depthBelowLandLayer
long_name :: Volumetric soil water layer 3
standard_name :
type :: forecast
units :: m**3 m**-3

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

swvl4

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: depthBelowLandLayer
long_name :: Volumetric soil water layer 4
standard_name :
type :: forecast
units :: m**3 m**-3

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

t

(time, level, cell)

float32

dask.array<chunksize=(6, 1, 16384), meta=np.ndarray>

levtype :: isobaricInhPa
long_name :: Temperature
standard_name :: air_temperature
type :: forecast
units :: K

	Array	Chunk
Bytes	58.50 MiB	384.00 kiB
Shape	(6, 13, 196608)	(6, 1, 16384)
Dask graph	156 chunks in 3 graph layers
Data type	float32 numpy.ndarray

tcwv

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: entireAtmosphere
long_name :: Total column vertically-integrated water vapour
standard_name :: lwe_thickness_of_atmosphere_mass_content_of_water_vapor
type :: forecast
units :: kg m**-2

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

tp

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: surface
long_name :: Total precipitation
standard_name :
type :: forecast
units :: m

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

ttr

(time, cell)

float32

dask.array<chunksize=(6, 16384), meta=np.ndarray>

levtype :: nominalTop
long_name :: Top net long-wave (thermal) radiation
standard_name :: toa_outgoing_longwave_flux
type :: forecast
units :: J m**-2

	Array	Chunk
Bytes	4.50 MiB	384.00 kiB
Shape	(6, 196608)	(6, 16384)
Dask graph	12 chunks in 3 graph layers
Data type	float32 numpy.ndarray

u

(time, level, cell)

float32

dask.array<chunksize=(6, 1, 16384), meta=np.ndarray>

levtype :: isobaricInhPa
long_name :: U component of wind
standard_name :: eastward_wind
type :: forecast
units :: m s**-1

	Array	Chunk
Bytes	58.50 MiB	384.00 kiB
Shape	(6, 13, 196608)	(6, 1, 16384)
Dask graph	156 chunks in 3 graph layers
Data type	float32 numpy.ndarray

v

(time, level, cell)

float32

dask.array<chunksize=(6, 1, 16384), meta=np.ndarray>

levtype :: isobaricInhPa
long_name :: V component of wind
standard_name :: northward_wind
type :: forecast
units :: m s**-1

	Array	Chunk
Bytes	58.50 MiB	384.00 kiB
Shape	(6, 13, 196608)	(6, 1, 16384)
Dask graph	156 chunks in 3 graph layers
Data type	float32 numpy.ndarray

vo

(time, level, cell)

float32

dask.array<chunksize=(6, 1, 16384), meta=np.ndarray>

levtype :: isobaricInhPa
long_name :: Vorticity (relative)
standard_name :: atmosphere_relative_vorticity
type :: forecast
units :: s**-1

	Array	Chunk
Bytes	58.50 MiB	384.00 kiB
Shape	(6, 13, 196608)	(6, 1, 16384)
Dask graph	156 chunks in 3 graph layers
Data type	float32 numpy.ndarray

w

(time, level, cell)

float32

dask.array<chunksize=(6, 1, 16384), meta=np.ndarray>

levtype :: isobaricInhPa
long_name :: Vertical velocity
standard_name :: lagrangian_tendency_of_air_pressure
type :: forecast
units :: Pa s**-1

	Array	Chunk
Bytes	58.50 MiB	384.00 kiB
Shape	(6, 13, 196608)	(6, 1, 16384)
Dask graph	156 chunks in 3 graph layers
Data type	float32 numpy.ndarray

Indexes: (3)

crs

PandasIndex

PandasIndex(Index([nan], dtype='float64', name='crs'))

level

PandasIndex

PandasIndex(Index([50, 100, 150, 200, 250, 300, 400, 500, 600, 700, 850, 925, 1000], dtype='int64', name='level'))

time

PandasIndex

PandasIndex(DatetimeIndex(['2024-09-01 03:00:00', '2024-09-01 06:00:00',
               '2024-09-01 09:00:00', '2024-09-01 12:00:00',
               '2024-09-01 15:00:00', '2024-09-01 18:00:00'],
              dtype='datetime64[ns]', name='time', freq=None))

Attributes: (0)

Data types#

Explicitly set the output datatype to single precision float for all float subtypes.

def get_dtype(da):
    """Return the dtype that ``da`` should be stored with.

    Every floating-point dtype is mapped to the string ``"float32"``; any
    other dtype is returned unchanged as ``da.dtype``.
    """
    is_float = np.issubdtype(da.dtype, np.floating)
    return "float32" if is_float else da.dtype


# Example: look up the storage dtype chosen for the "tcwv" variable.
get_dtype(ds["tcwv"])

'float32'

Chunking#

We define multi-dimensional chunks for more efficient data access. We aim at a chunk size of about 1 MB which is a reasonable choice when accessing data via HTTP. Depending on the total size of your dataset, this chunk size may result in millions (!) of individual files, which might cause problems on some file systems.

def get_chunks(dimensions):
    """Return the Zarr chunk shape for a variable with the given dimensions.

    Variables with a ``"level"`` dimension use 4 levels and ``4**6`` cells
    per chunk; all other variables use ``4**7`` cells per chunk. The
    ``"time"`` chunk length is 6 in both cases.

    The result is a tuple ordered like ``dimensions``. A dimension name
    without a configured chunk size raises ``KeyError``.
    """
    has_level = "level" in dimensions

    # Fewer cells per chunk when several levels are stored together.
    sizes = {"time": 6, "cell": 4**6 if has_level else 4**7}
    if has_level:
        sizes["level"] = 4

    return tuple(sizes[name] for name in dimensions)


# Example: chunk shape for "tcwv", a (time, cell) variable without levels.
get_chunks(ds["tcwv"].dims)

(6, 16384)

Compression#

We compress all variables using Zstd into a blosc container. Increasing the compression level from its default value of 5 will usually result in a slightly better compression ratio without adding significant overhead.

def get_compressor():
    """Create a new Blosc compressor using the Zstd codec at level 6."""
    codec = numcodecs.Blosc(cname="zstd", clevel=6)
    return codec


# Example: show the compressor configuration that will be used.
get_compressor()

Blosc(cname='zstd', clevel=6, shuffle=SHUFFLE, blocksize=0)

Plug and play#

Finally, we can put the pieces together to define an encoding for the whole dataset. The following function loops over all variables (that are not a dimension) and creates an encoding dictionary.

def get_encoding(dataset):
    """Build the per-variable encoding dictionary for ``dataset``.

    Every entry of ``dataset.variables`` whose name is not also a dimension
    name gets a dictionary with the keys ``"compressor"``, ``"dtype"`` and
    ``"chunks"``. The result maps variable names to these dictionaries.
    """
    encoding = {}
    for name in dataset.variables:
        # Variables named like a dimension are left out of the encoding.
        if name in dataset.dims:
            continue

        variable = dataset[name]
        encoding[name] = {
            "compressor": get_compressor(),
            "dtype": get_dtype(variable),
            "chunks": get_chunks(variable.dims),
        }
    return encoding


# Example: encoding for one variable with levels ("t") and one without ("2t").
get_encoding(ds[["t", "2t"]])

{'t': {'compressor': Blosc(cname='zstd', clevel=6, shuffle=SHUFFLE, blocksize=0),
  'dtype': 'float32',
  'chunks': (6, 4, 4096)},
 '2t': {'compressor': Blosc(cname='zstd', clevel=6, shuffle=SHUFFLE, blocksize=0),
  'dtype': 'float32',
  'chunks': (6, 16384)}}

The encoding dictionary can be passed to the to_zarr() function. When using dask, make sure that the dask chunks are aligned with the selected Zarr chunks, i.e. every dask chunk should cover a whole number of Zarr chunks. Otherwise xarray will raise an error to prevent multiple dask chunks from writing to the same chunk on disk.

# Rechunk the dask arrays and write the dataset to a local Zarr store; the
# on-disk chunks, dtype and compressor come from the encoding dictionary.
# "cell": -1 puts the whole cell dimension into a single dask chunk.
ds.chunk({"time": 24, "level": 4, "cell": -1}).to_zarr(
    "test_dataset.zarr", encoding=get_encoding(ds)
)

<xarray.backends.zarr.ZarrStore at 0x7f1d226f2020>