Skip to content

input4mips_validation.inference.from_data#

input4mips_validation.inference.from_data #

Inference of metadata from data

VARIABLE_DATASET_CATEGORY_MAP = {'tos': 'SSTsAndSeaIce', 'siconc': 'SSTsAndSeaIce', 'sftof': 'SSTsAndSeaIce', 'mole_fraction_of_carbon_dioxide_in_air': 'GHGConcentrations', 'mole_fraction_of_methane_in_air': 'GHGConcentrations', 'mole_fraction_of_nitrous_oxide_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc116_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc218_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc3110_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc4112_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc5114_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc6116_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc7118_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc318_in_air': 'GHGConcentrations', 'mole_fraction_of_carbon_tetrachloride_in_air': 'GHGConcentrations', 'mole_fraction_of_carbon_tetrafluoride_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc11_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc113_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc114_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc115_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc12_in_air': 'GHGConcentrations', 'mole_fraction_of_dichloromethane_in_air': 'GHGConcentrations', 'mole_fraction_of_methyl_bromide_in_air': 'GHGConcentrations', 'mole_fraction_of_hcc140a_in_air': 'GHGConcentrations', 'mole_fraction_of_methyl_chloride_in_air': 'GHGConcentrations', 'mole_fraction_of_chloroform_in_air': 'GHGConcentrations', 'mole_fraction_of_halon1211_in_air': 'GHGConcentrations', 'mole_fraction_of_halon1301_in_air': 'GHGConcentrations', 'mole_fraction_of_halon2402_in_air': 'GHGConcentrations', 'mole_fraction_of_hcfc141b_in_air': 'GHGConcentrations', 'mole_fraction_of_hcfc142b_in_air': 'GHGConcentrations', 'mole_fraction_of_hcfc22_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc125_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc134a_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc143a_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc152a_in_air': 
'GHGConcentrations', 'mole_fraction_of_hfc227ea_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc23_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc236fa_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc245fa_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc32_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc365mfc_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc4310mee_in_air': 'GHGConcentrations', 'mole_fraction_of_nitrogen_trifluoride_in_air': 'GHGConcentrations', 'mole_fraction_of_sulfur_hexafluoride_in_air': 'GHGConcentrations', 'mole_fraction_of_sulfuryl_fluoride_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc11_eq_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc12_eq_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc134a_eq_in_air': 'GHGConcentrations', 'solar_irradiance_per_unit_wavelength': 'solar', 'solar_irradiance': 'solar'} module-attribute #

Mapping from variable names to dataset category

The variable names are generally CF standard names (i.e. can include underscores) rather than CMIP data request names (which are meant to have no underscores or other special characters).

TODO: move this into CVs rather than hard-coding here

VARIABLE_REALM_MAP = {'tos': 'ocean', 'siconc': 'seaIce', 'sftof': 'ocean', 'areacello': 'ocean', 'mole_fraction_of_carbon_dioxide_in_air': 'atmos', 'mole_fraction_of_methane_in_air': 'atmos', 'mole_fraction_of_nitrous_oxide_in_air': 'atmos', 'mole_fraction_of_pfc116_in_air': 'atmos', 'mole_fraction_of_pfc218_in_air': 'atmos', 'mole_fraction_of_pfc3110_in_air': 'atmos', 'mole_fraction_of_pfc4112_in_air': 'atmos', 'mole_fraction_of_pfc5114_in_air': 'atmos', 'mole_fraction_of_pfc6116_in_air': 'atmos', 'mole_fraction_of_pfc7118_in_air': 'atmos', 'mole_fraction_of_pfc318_in_air': 'atmos', 'mole_fraction_of_carbon_tetrachloride_in_air': 'atmos', 'mole_fraction_of_carbon_tetrafluoride_in_air': 'atmos', 'mole_fraction_of_cfc11_in_air': 'atmos', 'mole_fraction_of_cfc113_in_air': 'atmos', 'mole_fraction_of_cfc114_in_air': 'atmos', 'mole_fraction_of_cfc115_in_air': 'atmos', 'mole_fraction_of_cfc12_in_air': 'atmos', 'mole_fraction_of_dichloromethane_in_air': 'atmos', 'mole_fraction_of_methyl_bromide_in_air': 'atmos', 'mole_fraction_of_hcc140a_in_air': 'atmos', 'mole_fraction_of_methyl_chloride_in_air': 'atmos', 'mole_fraction_of_chloroform_in_air': 'atmos', 'mole_fraction_of_halon1211_in_air': 'atmos', 'mole_fraction_of_halon1301_in_air': 'atmos', 'mole_fraction_of_halon2402_in_air': 'atmos', 'mole_fraction_of_hcfc141b_in_air': 'atmos', 'mole_fraction_of_hcfc142b_in_air': 'atmos', 'mole_fraction_of_hcfc22_in_air': 'atmos', 'mole_fraction_of_hfc125_in_air': 'atmos', 'mole_fraction_of_hfc134a_in_air': 'atmos', 'mole_fraction_of_hfc143a_in_air': 'atmos', 'mole_fraction_of_hfc152a_in_air': 'atmos', 'mole_fraction_of_hfc227ea_in_air': 'atmos', 'mole_fraction_of_hfc23_in_air': 'atmos', 'mole_fraction_of_hfc236fa_in_air': 'atmos', 'mole_fraction_of_hfc245fa_in_air': 'atmos', 'mole_fraction_of_hfc32_in_air': 'atmos', 'mole_fraction_of_hfc365mfc_in_air': 'atmos', 'mole_fraction_of_hfc4310mee_in_air': 'atmos', 'mole_fraction_of_nitrogen_trifluoride_in_air': 'atmos', 
'mole_fraction_of_sulfur_hexafluoride_in_air': 'atmos', 'mole_fraction_of_sulfuryl_fluoride_in_air': 'atmos', 'mole_fraction_of_cfc11_eq_in_air': 'atmos', 'mole_fraction_of_cfc12_eq_in_air': 'atmos', 'mole_fraction_of_hfc134a_eq_in_air': 'atmos', 'solar_irradiance_per_unit_wavelength': 'atmos', 'solar_irradiance': 'atmos', 'areacella': 'atmos'} module-attribute #

Mapping from variable names to realm

The variable names are generally CF standard names (i.e. can include underscores) rather than CMIP data request names (which are meant to have no underscores or other special characters).

TODO: move this into CVs rather than hard-coding here

BoundsInfo #

Definition of the values used for bounds handling

We put this together for ease of explanation and conciseness.

Source code in src/input4mips_validation/inference/from_data.py
@define
class BoundsInfo:
    """
    Definition of the values used for bounds handling

    We put this together for ease of explanation and conciseness.
    """

    time_bounds: str = "time_bounds"
    """
    Name of the variable which represents the bounds of the time axis
    """

    bounds_dim: str = "bounds"
    """
    The name of the bounds dimension in the data
    """

    bounds_dim_lower_val: int = 0
    """
    Value of the lower bounds dimension, which allows us to select the lower bounds.
    """

    bounds_dim_upper_val: int = 1
    """
    Value of the upper bounds dimension, which allows us to select the upper bounds.
    """

    @classmethod
    def from_ds(cls, ds: xr.Dataset, time_dimension: str = "time") -> BoundsInfo:
        """
        Initialise from a dataset

        If `ds` contains `time_dimension` and is not a climatology,
        the name of the time bounds variable is read
        from the `bounds` attribute of the time variable
        and the bounds dimension is the only non-time dimension of that variable.
        Otherwise, the bounds dimension is guessed from a short list of names
        and `time_bounds` is set to the placeholder `"not_used"`.

        Parameters
        ----------
        ds
            Dataset from which to initialise
        time_dimension
            The name of the time dimension in the dataset

        Returns
        -------
        :
            Initialised class

        Raises
        ------
        AssertionError
            The time bounds variable does not have exactly one non-time dimension,
            no bounds dimension could be guessed
            or the bounds dimension does not have exactly two values.
        """
        climatology = ds_is_climatology(ds, time_dimension)

        should_have_time_bounds = (time_dimension in ds) and (not climatology)

        if should_have_time_bounds:
            # Has to be like this according to CF-convention
            bounds_info_key = "bounds"
            time_bounds = ds[time_dimension].attrs[bounds_info_key]
            time_bounds_dims = ds[time_bounds].dims
            bounds_dim_l = [v for v in time_bounds_dims if v != time_dimension]
            if len(bounds_dim_l) != 1:
                msg = (
                    f"Expected to find just one non-time dimension for {time_bounds}. "
                    f"Inferred: {bounds_dim_l=}. "
                    f"Original dimensions of {time_bounds}: {time_bounds_dims}"
                )
                raise AssertionError(msg)

            bounds_dim = bounds_dim_l[0]

        else:
            if climatology:
                logger.debug("climatology, guessing bounds info")
            else:
                logger.debug(
                    f"{time_dimension=} not in the dataset, guessing bounds info"
                )

            guesses = ("bounds", "bnds", "nv")
            for guess in guesses:
                if guess in ds.dims:
                    bounds_dim = guess
                    # There is no time bounds variable to point to in this case
                    time_bounds = "not_used"
                    logger.debug(
                        f"Found {bounds_dim}, assuming that is the bounds variable"
                    )
                    break

            else:
                msg = (
                    "Could not guess which variable was the bounds variable. "
                    f"Guessed {guesses=}. "
                    f"{ds=}."
                )
                raise AssertionError(msg)

        # Upper, lower
        bounds_dim_expected_size = 2
        bounds_dim_size = ds[bounds_dim].size
        if bounds_dim_size != bounds_dim_expected_size:
            # Give the reader the context, not just the offending number
            msg = (
                f"Expected the bounds dimension {bounds_dim!r} to have size "
                f"{bounds_dim_expected_size} (lower and upper bound), "
                f"but it has size {bounds_dim_size}"
            )
            raise AssertionError(msg)

        bounds_dim_upper_val = int(ds[bounds_dim].max().values.squeeze())
        bounds_dim_lower_val = int(ds[bounds_dim].min().values.squeeze())

        return cls(
            time_bounds=time_bounds,
            bounds_dim=bounds_dim,
            bounds_dim_lower_val=bounds_dim_lower_val,
            bounds_dim_upper_val=bounds_dim_upper_val,
        )

bounds_dim: str = 'bounds' class-attribute instance-attribute #

The name of the bounds dimension in the data

bounds_dim_lower_val: int = 0 class-attribute instance-attribute #

Value of the lower bounds dimension, which allows us to select the lower bounds.

bounds_dim_upper_val: int = 1 class-attribute instance-attribute #

Value of the upper bounds dimension, which allows us to select the upper bounds.

time_bounds: str = 'time_bounds' class-attribute instance-attribute #

Name of the variable which represents the bounds of the time axis

from_ds(ds, time_dimension='time') classmethod #

Initialise from a dataset

Parameters:

Name Type Description Default
ds Dataset

Dataset from which to initialise

required
time_dimension str

The name of the time dimension in the dataset

'time'

Returns:

Type Description
BoundsInfo

Initialised class

Source code in src/input4mips_validation/inference/from_data.py
@classmethod
def from_ds(cls, ds: xr.Dataset, time_dimension: str = "time") -> BoundsInfo:
    """
    Initialise from a dataset

    If `ds` contains `time_dimension` and is not a climatology,
    the name of the time bounds variable is read
    from the `bounds` attribute of the time variable
    and the bounds dimension is the only non-time dimension of that variable.
    Otherwise, the bounds dimension is guessed from a short list of names
    and `time_bounds` is set to the placeholder `"not_used"`.

    Parameters
    ----------
    ds
        Dataset from which to initialise
    time_dimension
        The name of the time dimension in the dataset

    Returns
    -------
    :
        Initialised class

    Raises
    ------
    AssertionError
        The time bounds variable does not have exactly one non-time dimension,
        no bounds dimension could be guessed
        or the bounds dimension does not have exactly two values.
    """
    climatology = ds_is_climatology(ds, time_dimension)

    should_have_time_bounds = (time_dimension in ds) and (not climatology)

    if should_have_time_bounds:
        # Has to be like this according to CF-convention
        bounds_info_key = "bounds"
        time_bounds = ds[time_dimension].attrs[bounds_info_key]
        time_bounds_dims = ds[time_bounds].dims
        bounds_dim_l = [v for v in time_bounds_dims if v != time_dimension]
        if len(bounds_dim_l) != 1:
            msg = (
                f"Expected to find just one non-time dimension for {time_bounds}. "
                f"Inferred: {bounds_dim_l=}. "
                f"Original dimensions of {time_bounds}: {time_bounds_dims}"
            )
            raise AssertionError(msg)

        bounds_dim = bounds_dim_l[0]

    else:
        if climatology:
            logger.debug("climatology, guessing bounds info")
        else:
            logger.debug(
                f"{time_dimension=} not in the dataset, guessing bounds info"
            )

        guesses = ("bounds", "bnds", "nv")
        for guess in guesses:
            if guess in ds.dims:
                bounds_dim = guess
                # There is no time bounds variable to point to in this case
                time_bounds = "not_used"
                logger.debug(
                    f"Found {bounds_dim}, assuming that is the bounds variable"
                )
                break

        else:
            msg = (
                "Could not guess which variable was the bounds variable. "
                f"Guessed {guesses=}. "
                f"{ds=}."
            )
            raise AssertionError(msg)

    # Upper, lower
    bounds_dim_expected_size = 2
    bounds_dim_size = ds[bounds_dim].size
    if bounds_dim_size != bounds_dim_expected_size:
        # Give the reader the context, not just the offending number
        msg = (
            f"Expected the bounds dimension {bounds_dim!r} to have size "
            f"{bounds_dim_expected_size} (lower and upper bound), "
            f"but it has size {bounds_dim_size}"
        )
        raise AssertionError(msg)

    bounds_dim_upper_val = int(ds[bounds_dim].max().values.squeeze())
    bounds_dim_lower_val = int(ds[bounds_dim].min().values.squeeze())

    return cls(
        time_bounds=time_bounds,
        bounds_dim=bounds_dim,
        bounds_dim_lower_val=bounds_dim_lower_val,
        bounds_dim_upper_val=bounds_dim_upper_val,
    )

FrequencyMetadataKeys #

Definition of the keys used for frequency metadata

We put this together for ease of explanation and conciseness.

Source code in src/input4mips_validation/inference/from_data.py
@define
class FrequencyMetadataKeys:
    """
    Keys and values used when handling frequency metadata

    Grouping them in one place keeps explanations short
    and avoids passing the same values around separately.
    """

    frequency_metadata_key: str = "frequency"
    """
    Key under which the data's metadata stores
    information about the data's frequency
    """

    no_time_axis_frequency: str = "fx"
    """
    Value stored under `frequency_metadata_key` when the file
    has no time axis, i.e. the data is fixed in time.
    """

frequency_metadata_key: str = 'frequency' class-attribute instance-attribute #

The key in the data's metadata which points to information about the data's frequency

no_time_axis_frequency: str = 'fx' class-attribute instance-attribute #

The value of frequency_metadata_key in the metadata which indicates that the file has no time axis i.e. is fixed in time.

create_time_range_for_filename(time_start, time_end, ds_frequency, start_end_separator='-') #

Create the time range information for the filename

It is safest to use this function with the output from infer_time_start_time_end_for_filename because that function correctly infers the start and end time from the data, even when the data represents a climatology.

Parameters:

Name Type Description Default
time_start cftime.datetime | dt.datetime | np.datetime64

The start time (of the underlying dataset)

required
time_end cftime.datetime | dt.datetime | np.datetime64

The end time (of the underlying dataset)

required
ds_frequency str

The frequency of the underlying dataset

required
start_end_separator str

The string(s) to use to separate the start and end time.

'-'

Returns:

Type Description
str

The time-range information, formatted correctly given the underlying dataset's frequency.

Source code in src/input4mips_validation/inference/from_data.py
def create_time_range_for_filename(
    time_start: cftime.datetime | dt.datetime | np.datetime64,
    time_end: cftime.datetime | dt.datetime | np.datetime64,
    ds_frequency: str,
    start_end_separator: str = "-",
) -> str:
    """
    Build the time-range part of a filename

    Prefer feeding this function the values returned by
    [`infer_time_start_time_end_for_filename`][input4mips_validation.inference.from_data.infer_time_start_time_end_for_filename],
    as that function infers the start and end time from the data correctly,
    including for data which represents a climatology.

    Parameters
    ----------
    time_start
        The start time (of the underlying dataset)

    time_end
        The end time (of the underlying dataset)

    ds_frequency
        The frequency of the underlying dataset

    start_end_separator
        The string(s) placed between the formatted start and end time.

    Returns
    -------
    :
        The formatted start and end time, joined by `start_end_separator`.
        If `ds_frequency` indicates a climatology, `"-clim"` is appended.
    """
    # Both ends are formatted the same way, driven by the dataset's frequency
    formatted_ends = [
        format_date_for_time_range(time_point, ds_frequency=ds_frequency)
        for time_point in (time_start, time_end)
    ]
    time_range = start_end_separator.join(formatted_ends)

    if not frequency_is_climatology(ds_frequency):
        return time_range

    return f"{time_range}-clim"

ds_is_climatology(ds, time_dimension) #

Determine whether a dataset represents a climatology or not

Parameters:

Name Type Description Default
ds Dataset

Dataset to check

required
time_dimension str

The name of the time dimension in ds, if ds contains a time dimension

required

Returns:

Type Description
bool

Whether the dataset is a climatology or not

Source code in src/input4mips_validation/inference/from_data.py
def ds_is_climatology(ds: xr.Dataset, time_dimension: str) -> bool:
    """
    Check whether a dataset represents a climatology

    Parameters
    ----------
    ds
        Dataset to check

    time_dimension
        The name of the time dimension in `ds`, if `ds` contains a time dimension

    Returns
    -------
    :
        `True` if `time_dimension` is in `ds`
        and has a `climatology` attribute, otherwise `False`
    """
    if time_dimension not in ds:
        # No time axis, so it cannot be a climatology
        return False

    # As far as I can tell from the cf-conventions,
    # the presence of a `climatology` attribute on the time coordinate
    # is what defines whether something is a climatology or not.
    # See https://cfconventions.org/Data/cf-conventions/cf-conventions-1.12/cf-conventions.html#climatological-statistics
    #
    # > Intervals of climatological time
    # > are conceptually different from ordinary time intervals...
    # > To indicate this difference,
    # > a climatological time coordinate variable does not have a bounds attribute.
    # > Instead, it has a climatology attribute
    time_attrs = ds[time_dimension].attrs

    return "climatology" in time_attrs

frequency_is_climatology(frequency) #

Check whether the frequency information indicates that the data is a climatology

Parameters:

Name Type Description Default
frequency str

Frequency attribute value

required

Returns:

Type Description
bool

Whether the data represents a climatology or not

Source code in src/input4mips_validation/inference/from_data.py
def frequency_is_climatology(frequency: str) -> bool:
    """
    Tell whether a frequency value marks the data as a climatology

    Parameters
    ----------
    frequency
        Frequency attribute value

    Returns
    -------
    :
        `True` if `frequency` is one of the climatology frequencies
        (currently only `"monC"`), otherwise `False`
    """
    climatology_frequencies = frozenset(("monC",))

    return frequency in climatology_frequencies

get_climatology_bounds(ds, time_dimension='time') #

Get the climatology bounds variable

This should only be used after having first checked that ds is a climatology (using e.g. ds_is_climatology).

Parameters:

Name Type Description Default
ds Dataset

Dataset

required
time_dimension str

Time dimension in ds

'time'

Returns:

Type Description
DataArray

Climatology bounds variable

Source code in src/input4mips_validation/inference/from_data.py
def get_climatology_bounds(
    ds: xr.Dataset, time_dimension: str = "time"
) -> xr.DataArray:
    """
    Retrieve the variable holding the climatology bounds

    Only call this once you know that `ds` is a climatology
    (e.g. having checked with
    [`ds_is_climatology`][input4mips_validation.inference.from_data.ds_is_climatology]
    ).

    Parameters
    ----------
    ds
        Dataset

    time_dimension
        Time dimension in `ds`

    Returns
    -------
    :
        The variable in `ds` named by the `climatology` attribute
        of the time variable
    """
    # The spec defines that the `climatology` attribute of the time coordinate
    # names the bounds variable, so we can rely on it.
    # For further details, see comments in `ds_is_climatology`.
    time_attrs = ds[time_dimension].attrs
    bounds_variable_name = time_attrs["climatology"]

    return ds[bounds_variable_name]

get_frequency_label_stem(ds, climatology, time_dimension, time_bounds, bounds_dim, bounds_dim_lower_val, bounds_dim_upper_val) #

Get the frequency label's stem from data

This is mainly intended for internal use, see infer_frequency instead.

Parameters:

Name Type Description Default
ds Dataset

Dataset

required
climatology bool

Does this dataset represent a climatology?

required
time_dimension str

Name of the time dimension in ds.

required
time_bounds str

Variable assumed to contain time bounds information

required
bounds_dim str

The name of the bounds dimension

required
bounds_dim_lower_val int

Value of the lower bounds dimension, which allows us to select the lower bounds.

required
bounds_dim_upper_val int

Value of the upper bounds dimension, which allows us to select the upper bounds.

required

Returns:

Type Description
str

Inferred frequency stem e.g. "mon", "yr".

Climatology information is added in infer_frequency.

Source code in src/input4mips_validation/inference/from_data.py
def get_frequency_label_stem(  # noqa: PLR0913
    ds: xr.Dataset,
    climatology: bool,
    time_dimension: str,
    time_bounds: str,
    bounds_dim: str,
    bounds_dim_lower_val: int,
    bounds_dim_upper_val: int,
) -> str:
    """
    Get the frequency label's stem from data

    This is mainly intended for internal use,
    see [`infer_frequency`][input4mips_validation.inference.from_data.infer_frequency]
    instead.

    Parameters
    ----------
    ds
        Dataset

    climatology
        Does this dataset represent a climatology?

    time_dimension
        Name of the time dimension in `ds`.

    time_bounds
        Variable assumed to contain time bounds information.

        Only used if `climatology` is `False`.

    bounds_dim
        The name of the bounds dimension

    bounds_dim_lower_val
        Value of the lower bounds dimension, which allows us to select the lower bounds.

    bounds_dim_upper_val
        Value of the upper bounds dimension, which allows us to select the upper bounds.

    Returns
    -------
    :
        Inferred frequency stem e.g. "mon", "yr".

        Climatology information is added in
        [`infer_frequency`][input4mips_validation.inference.from_data.infer_frequency].

    Raises
    ------
    NotImplementedError
        The steps are not recognised as yearly, monthly or daily.
    """
    if climatology:
        # Only have time to work with, no bounds,
        # so compare each time point with the following one.
        # Index via `time_dimension` rather than a hard-coded "time"
        # so that datasets with a differently named time axis work too.
        time_axis = ds[time_dimension]
        step_start = time_axis.isel({time_dimension: slice(None, -1)})
        step_end = time_axis.isel({time_dimension: slice(1, None)})

    else:
        # Use compute to avoid any dask stupidity
        step_start = ds[time_bounds].sel({bounds_dim: bounds_dim_lower_val}).compute()
        step_end = ds[time_bounds].sel({bounds_dim: bounds_dim_upper_val}).compute()

    # Checked in this order, the first matching check wins
    if is_yearly_steps(
        step_start=step_start,
        step_end=step_end,
    ):
        return "yr"

    if is_monthly_steps(
        step_start=step_start,
        step_end=step_end,
    ):
        return "mon"

    if is_daily_steps(
        step_start=step_start,
        step_end=step_end,
    ):
        return "day"

    raise NotImplementedError(ds)

infer_frequency(ds, no_time_axis_frequency, time_dimension='time', time_bounds='time_bounds', bounds_dim='bounds', bounds_dim_lower_val=0, bounds_dim_upper_val=1) #

Infer frequency from data

TODO: work out if/where these rules are captured anywhere else. These resources are helpful, but I'm not sure if they spell out the rules exactly:

  • https://github.com/WCRP-CMIP/CMIP6_CVs/blob/main/CMIP6_frequency.json
  • https://wcrp-cmip.github.io/WGCM_Infrastructure_Panel/Papers/CMIP6_global_attributes_filenames_CVs_v6.2.7.pdf

Parameters:

Name Type Description Default
ds Dataset

Dataset

required
no_time_axis_frequency str

Value to return if the data has no time axis i.e. is a fixed field.

required
time_dimension str

Name of the expected time dimension in ds.

If time_dimension is not in ds, we assume the data is a fixed field.

'time'
time_bounds str

Variable assumed to contain time bounds information

'time_bounds'
bounds_dim str

The name of the bounds dimension

'bounds'
bounds_dim_lower_val int

Value of the lower bounds dimension, which allows us to select the lower bounds.

0
bounds_dim_upper_val int

Value of the upper bounds dimension, which allows us to select the upper bounds.

1

Returns:

Type Description
str

Inferred frequency

Source code in src/input4mips_validation/inference/from_data.py
def infer_frequency(  # noqa: PLR0913
    ds: xr.Dataset,
    no_time_axis_frequency: str,
    time_dimension: str = "time",
    time_bounds: str = "time_bounds",
    bounds_dim: str = "bounds",
    bounds_dim_lower_val: int = 0,
    bounds_dim_upper_val: int = 1,
) -> str:
    """
    Work out the frequency label of a dataset from its data

    TODO: work out if/where these rules are captured anywhere else.
    These resources are helpful, but I'm not sure if they spell out the rules exactly:

    - https://github.com/WCRP-CMIP/CMIP6_CVs/blob/main/CMIP6_frequency.json
    - https://wcrp-cmip.github.io/WGCM_Infrastructure_Panel/Papers/CMIP6_global_attributes_filenames_CVs_v6.2.7.pdf

    Parameters
    ----------
    ds
        Dataset

    no_time_axis_frequency
        Value to return if the data has no time axis i.e. is a fixed field.

    time_dimension
        Name of the expected time dimension in `ds`.

        If `time_dimension` is not in `ds`, we assume the data is a fixed field.

    time_bounds
        Variable assumed to contain time bounds information

    bounds_dim
        The name of the bounds dimension

    bounds_dim_lower_val
        Value of the lower bounds dimension, which allows us to select the lower bounds.

    bounds_dim_upper_val
        Value of the upper bounds dimension, which allows us to select the upper bounds.

    Returns
    -------
    :
        Inferred frequency

    Raises
    ------
    NotImplementedError
        The data is a climatology with a frequency stem other than `"mon"`.
    """
    if time_dimension not in ds:
        # Fixed field
        logger.debug(f"{time_dimension=} not in {ds=}, assuming fixed field")
        return no_time_axis_frequency

    climatology = ds_is_climatology(ds, time_dimension)

    frequency_stem = get_frequency_label_stem(
        ds=ds,
        climatology=climatology,
        time_dimension=time_dimension,
        time_bounds=time_bounds,
        bounds_dim=bounds_dim,
        bounds_dim_lower_val=bounds_dim_lower_val,
        bounds_dim_upper_val=bounds_dim_upper_val,
    )

    if not climatology:
        # The stem is the full label for non-climatology data
        return frequency_stem

    if frequency_stem != "mon":
        # Apparently 1hrCM is also a thing, not implemented (yet)
        msg = f"{climatology=} and {frequency_stem=}"
        raise NotImplementedError(msg)

    return f"{frequency_stem}C"

infer_time_start_time_end_for_filename(ds, frequency_metadata_key, no_time_axis_frequency, time_dimension) #

Infer start and end time of the data in a dataset for creating file names

Parameters:

Name Type Description Default
ds Dataset

Dataset from which to infer start and end time

required
frequency_metadata_key str

The key in the data's metadata which points to information about the data's frequency

required
no_time_axis_frequency str

The value of frequency_metadata_key in the metadata which indicates that the file has no time axis i.e. is fixed in time.

required
time_dimension str

The time dimension of the data

required

Returns:

Name Type Description
time_start Union[cftime.datetime, dt.datetime, np.datetime64, None]

Start time of the data

time_end Union[cftime.datetime, dt.datetime, np.datetime64, None]

End time of the data

Source code in src/input4mips_validation/inference/from_data.py
def infer_time_start_time_end_for_filename(
    ds: xr.Dataset,
    frequency_metadata_key: str,
    no_time_axis_frequency: str,
    time_dimension: str,
) -> tuple[
    Union[cftime.datetime, dt.datetime, np.datetime64, None],
    Union[cftime.datetime, dt.datetime, np.datetime64, None],
]:
    """
    Work out the start and end time of a dataset for use in file names

    Parameters
    ----------
    ds
        Dataset from which to infer start and end time

    frequency_metadata_key
        The key in the data's metadata
        which points to information about the data's frequency

    no_time_axis_frequency
        The value of `frequency_metadata_key` in the metadata which indicates
        that the file has no time axis i.e. is fixed in time.

    time_dimension
        The time dimension of the data

    Returns
    -------
    time_start :
        Start time of the data, `None` if the data has no time axis

    time_end :
        End time of the data, `None` if the data has no time axis

    Raises
    ------
    TypeError
        The data is a climatology and its end time is a `np.datetime64`.
    """
    frequency = ds.attrs[frequency_metadata_key]
    is_climatology = frequency_is_climatology(frequency)

    if frequency == no_time_axis_frequency:
        # Fixed in time, so there is no time range
        return None, None

    if not is_climatology:
        # Regular time axis: the range is simply the extent of the time values
        time_axis = ds[time_dimension]
        return (
            xr_time_min_max_to_single_value(time_axis.min()),
            xr_time_min_max_to_single_value(time_axis.max()),
        )

    # Can do this with confidence as this is what the spec defines.
    # See comments in `ds_is_climatology`.
    climatology_bounds = get_climatology_bounds(ds, time_dimension=time_dimension)

    time_start = xr_time_min_max_to_single_value(climatology_bounds.min())
    time_end = xr_time_min_max_to_single_value(climatology_bounds.max())
    if isinstance(time_end, np.datetime64):
        raise TypeError(time_end)

    # If first day of month,
    # roll back one day to reflect the fact that the bound is exclusive.
    if frequency == "monC" and time_end.day == 1:
        time_end = time_end - dt.timedelta(days=1)

    return time_start, time_end

is_daily_steps(step_start, step_end) #

Determine whether the steps are daily

Parameters:

Name Type Description Default
step_start DataArray

Start of each step (e.g. start of each bound)

required
step_end DataArray

End of each step (e.g. end of each bound)

required

Returns:

Type Description
bool

True if the steps are daily, otherwise False

Source code in src/input4mips_validation/inference/from_data.py
def is_daily_steps(
    step_start: xr.DataArray,
    step_end: xr.DataArray,
) -> bool:
    """
    Check whether every step is daily

    Parameters
    ----------
    step_start
        Start of each step (e.g. start of each bound)

    step_end
        End of each step (e.g. end of each bound)

    Returns
    -------
    :
        `True` if the `days` component of every step's length is one,
        otherwise `False`
    """
    # Use compute to avoid any dask stupidity
    starts = step_start.compute()
    ends = step_end.compute()

    step_days = (ends - starts).dt.days
    all_one_day = (step_days == 1).all()

    return bool(all_one_day)

is_monthly_steps(step_start, step_end) #

Determine whether the steps are monthly

Parameters:

Name Type Description Default
step_start DataArray

Start of each step (e.g. start of each bound)

required
step_end DataArray

End of each step (e.g. end of each bound)

required

Returns:

Type Description
bool

True if the steps are monthly, otherwise False

Source code in src/input4mips_validation/inference/from_data.py
def is_monthly_steps(
    step_start: xr.DataArray,
    step_end: xr.DataArray,
) -> bool:
    """
    Determine whether the steps are monthly

    A step is treated as monthly if its end falls in the calendar month
    immediately after the month of its start.
    The day within the month is not checked.

    Parameters
    ----------
    step_start
        Start of each step (e.g. start of each bound)

    step_end
        End of each step (e.g. end of each bound)

    Returns
    -------
    :
        `True` if the steps are monthly, otherwise `False`
    """
    # We compare calendar months rather than counting the days in each step
    # (i.e. checking that each step is between 28 and 31 days long)
    # because October 5 to October 14 1582 (inclusive)
    # don't exist in the mixed Julian/Gregorian calendar,
    # so you don't get the right number of days for October 1582
    # if you count days.
    month_diff = step_end.dt.month.values - step_start.dt.month.values
    year_diff = step_end.dt.year.values - step_start.dt.year.values

    MONTH_DIFF_IF_END_OF_YEAR = -11
    # The year has to be checked in both cases,
    # otherwise e.g. January 2000 to February 2001 would look monthly.
    next_month_same_year = (month_diff == 1) & (year_diff == 0)
    december_to_january = (month_diff == MONTH_DIFF_IF_END_OF_YEAR) & (
        year_diff == 1
    )

    all_monthly = (next_month_same_year | december_to_january).all()

    return bool(all_monthly)

is_yearly_steps(step_start, step_end) #

Determine whether the steps are yearly

Parameters:

Name Type Description Default
step_start DataArray

Start of each step (e.g. start of each bound)

required
step_end DataArray

End of each step (e.g. end of each bound)

required

Returns:

Type Description
bool

True if the steps are yearly, otherwise False

Source code in src/input4mips_validation/inference/from_data.py
def is_yearly_steps(
    step_start: xr.DataArray,
    step_end: xr.DataArray,
) -> bool:
    """
    Check whether every step is yearly

    Parameters
    ----------
    step_start
        Start of each step (e.g. start of each bound)

    step_end
        End of each step (e.g. end of each bound)

    Returns
    -------
    :
        `True` if every step ends in the same calendar month as it starts,
        exactly one year later, otherwise `False`
    """
    end_months = step_end.dt.month.values
    start_months = step_start.dt.month.values
    end_years = step_end.dt.year.values
    start_years = step_start.dt.year.values

    same_month = (end_months - start_months) == 0
    one_year_later = (end_years - start_years) == 1

    return bool((same_month & one_year_later).all())