input4mips_validation.dataset#

Sub-package	Description
dataset	Dataset class definition
metadata	Metadata for `Input4MIPsDataset` objects
metadata_data_producer_minimum	Minimum metadata required from an input4MIPs dataset producer
metadata_data_producer_multiple_variable_minimum	Minimum metadata required from an input4MIPs dataset producer for a multi-variable file

`input4mips_validation.dataset` #

Classes that define an input4MIPs dataset and associated metadata

`Input4MIPsDataset` #

Representation of an input4MIPs dataset

For validation, see [TODO: validate_input4mips_ds function and then cross-ref here].

Source code in src/input4mips_validation/dataset/dataset.py

@frozen
class Input4MIPsDataset:
    """
    Representation of an input4MIPs dataset

    Groups together the data, its input4MIPs metadata,
    the controlled vocabularies (CVs) to use with the dataset
    and any extra metadata that is not part of input4MIPs' data model.

    For validation, see
    [TODO: `validate_input4mips_ds` function and then cross-ref here].
    """

    data: xr.Dataset
    """
    Data
    """

    metadata: Input4MIPsDatasetMetadata
    """
    Metadata
    """

    # No default value is given here.
    # The default is supplied by the `@cvs.default` method of this class,
    # which calls `load_cvs`.
    cvs: Input4MIPsCVs = field()
    """
    Controlled vocabularies to use with this dataset

    If not supplied, we create these with
    [`load_cvs`][input4mips_validation.cvs.loading.load_cvs]
    """

    # Checked by the `@non_input4mips_metadata.validator` method of this class,
    # which raises if any key is also a key of `metadata`.
    non_input4mips_metadata: Optional[dict[str, str]] = field(default=None)
    """
    Metadata that isn't part of input4MIPs' data model
    This will simply be written as attributes to the file,
    as long as it doesn't clash with any of the input4MIPs keys.
    """

    @non_input4mips_metadata.validator
    def _no_clash_with_metadata_attributes(
        self, attribute: attr.Attribute[Any], value: dict[str, Any] | None
    ) -> None:
        """
        Check that `value` shares no keys with `self.metadata`

        Parameters
        ----------
        attribute
            Attribute being validated.
            Only its name is used, to build the error message.

        value
            Value to validate.
            `None` is accepted without any further checks.

        Raises
        ------
        AssertionError
            `value` contains at least one key
            that is also a key of `asdict(self.metadata)`.
        """
        if value is None:
            return

        # Convert the metadata once, rather than once per key in `value`.
        metadata_keys = asdict(self.metadata)
        clashing_keys = [key for key in value if key in metadata_keys]
        if clashing_keys:
            msg = (
                f"{attribute.name} must not contain any keys "
                "that clash with the `self.metadata`. "
                f"Keys in both {attribute.name} and `self.metadata`: {clashing_keys}"
            )
            raise AssertionError(msg)

    @cvs.default
    def _load_default_cvs(self) -> Input4MIPsCVs:
        """
        Provide the default value for `cvs`

        Returns
        -------
        :
            Result of calling `load_cvs` without any arguments
        """
        return load_cvs()

    @classmethod
    def from_ds(
        cls,
        ds: xr.Dataset,
        cvs: Input4MIPsCVs | None,
    ) -> Input4MIPsDataset:
        """
        Initialise from an existing dataset

        Parameters
        ----------
        ds
            Dataset from which to initialise.
            We infer the metadata from `ds.attrs`.

            Every attribute whose name matches a field of
            `Input4MIPsDatasetMetadata` is used as metadata.
            All other attributes are stored as `non_input4mips_metadata`.

        cvs
            Controlled vocabularies to use with the dataset

            If `None`, `cvs` is not passed to the initialiser,
            so the class's default value for `cvs` is used.

        Returns
        -------
        :
            Initialised instance
        """
        # The instance holds a copy of `ds` with its attributes cleared,
        # the attributes are carried by the metadata objects instead.
        ds_stripped = ds.copy()
        ds_stripped.attrs = {}

        metadata_fields = [
            f.name for f in fields(Input4MIPsDatasetMetadata) if f.name in ds.attrs
        ]
        metadata = Input4MIPsDatasetMetadata(
            **{k: ds.attrs[k] for k in metadata_fields}
        )
        non_input4mips_metadata = {
            k: v for k, v in ds.attrs.items() if k not in metadata_fields
        }

        init_kwargs: dict[str, Any] = {
            "data": ds_stripped,
            "metadata": metadata,
            "non_input4mips_metadata": non_input4mips_metadata,
        }
        if cvs is not None:
            # Only pass `cvs` if we have a value, otherwise rely on the default.
            init_kwargs["cvs"] = cvs

        # Use `cls` rather than hard-coding the class name,
        # so that subclasses get an instance of their own type.
        return cls(**init_kwargs)

    @classmethod
    def from_data_producer_minimum_information(  # noqa: PLR0913
        cls,
        data: xr.Dataset,
        metadata_minimum: Input4MIPsDatasetMetadataDataProducerMinimum,
        cvs: Input4MIPsCVs | None = None,
        prepare_func: PrepareFuncLike | None = None,
        copy_ds: bool = True,
        activity_id: str = "input4MIPs",
        dataset_category: str | None = None,
        realm: str | None = None,
        xr_variable_processor: XRVariableProcessorLike = XRVariableHelper(),
    ) -> Input4MIPsDataset:
        """
        Initialise from the minimum information required from the data producer

        Use this for datasets that have a single variable.
        For multi-variable datasets, see
        [`from_data_producer_minimum_information_multiple_variable`][input4mips_validation.dataset.Input4MIPsDataset.from_data_producer_minimum_information_multiple_variable].

        Parameters
        ----------
        data
            Raw data

        metadata_minimum
            Minimum metadata required from the data producer

        cvs
            CVs to use for inference and validation

            If not supplied, this will be retrieved with
            [`load_cvs`][input4mips_validation.cvs.load_cvs]

        prepare_func
            Function to use to prepare the data and infer the frequency metadata.

            It is called with the keyword arguments `ds_raw` and `copy_ds`
            and must return the prepared dataset and the frequency.

            If not supplied, we use
            `input4mips_validation.dataset.dataset.prepare_ds_and_get_frequency`.

        copy_ds
            Should `data` be copied before we create the `Input4MIPsDataset`?

        activity_id
            Activity ID that applies to the dataset.

            Given this is an Input4MIPsDataset, you shouldn't need to change this.

        dataset_category
            The category of the data.

            If not supplied, we will try and infer this based on
            [`VARIABLE_DATASET_CATEGORY_MAP`][input4mips_validation.inference.from_data.VARIABLE_DATASET_CATEGORY_MAP].

        realm
            The realm of the data.

            If not supplied, we will try and infer this based on
            [`VARIABLE_REALM_MAP`][input4mips_validation.inference.from_data.VARIABLE_REALM_MAP].

        xr_variable_processor
            Helper to use for processing the variables in xarray objects.

        Returns
        -------
        :
            Initialised instance

        Raises
        ------
        AssertionError
            The CVs' entry for the source ID does not specify a license ID.
        """
        variable_id = get_ds_var_assert_single(
            data, xr_variable_processor=xr_variable_processor
        )

        ### From here until the end marker, the logic deliberately duplicates
        # the equivalent section of
        # `from_data_producer_minimum_information_multiple_variable`.
        # Sharing it would add a layer of abstraction and coupling
        # that isn't worth it right now.
        if cvs is None:
            cvs = load_cvs()

        prepare_func_use: PrepareFuncLike
        if prepare_func is not None:
            prepare_func_use = prepare_func
        else:
            prepare_func_use = prepare_ds_and_get_frequency  # type: ignore # can't get mypy to behave

        ds_prepared, frequency = prepare_func_use(
            # Any copying is done here, so the prepare function must not copy again
            ds_raw=data.copy() if copy_ds else data,
            copy_ds=False,
        )

        # [TODO: remove this once we are confident in our license checks]
        source_id_values = cvs.source_id_entries[metadata_minimum.source_id].values
        license_id = source_id_values.license_id
        if license_id is None:
            msg = "License ID must be specified in the CVs source ID"
            raise AssertionError(msg)
        ### End of deliberately duplicated logic

        # Fall back to the variable-based lookups for anything not supplied
        if dataset_category is None:
            dataset_category = VARIABLE_DATASET_CATEGORY_MAP[variable_id]

        if realm is None:
            realm = VARIABLE_REALM_MAP[variable_id]

        metadata = Input4MIPsDatasetMetadata(
            activity_id=activity_id,
            contact=source_id_values.contact,
            dataset_category=dataset_category,
            frequency=frequency,
            further_info_url=source_id_values.further_info_url,
            grid_label=metadata_minimum.grid_label,
            # # TODO: look this up from central CVs
            # institution=source_id_values.institution,
            institution_id=source_id_values.institution_id,
            license=cvs.license_entries[license_id].values.conditions,
            license_id=license_id,
            mip_era=source_id_values.mip_era,
            nominal_resolution=metadata_minimum.nominal_resolution,
            realm=realm,
            source_id=metadata_minimum.source_id,
            source_version=source_id_values.source_version,
            target_mip=metadata_minimum.target_mip,
            variable_id=variable_id,
        )

        return cls(data=ds_prepared, metadata=metadata, cvs=cvs)

    @classmethod
    def from_data_producer_minimum_information_multiple_variable(  # noqa: PLR0913
        cls,
        data: xr.Dataset,
        metadata_minimum: Input4MIPsDatasetMetadataDataProducerMultipleVariableMinimum,
        cvs: Input4MIPsCVs | None = None,
        prepare_func: PrepareFuncLike | None = None,
        copy_ds: bool = True,
        activity_id: str = "input4MIPs",
        variable_id: str = "multiple",
    ) -> Input4MIPsDataset:
        """
        Initialise from the minimum information required from the data producer

        Use this for datasets that have multiple variables.
        For single variable datasets, see
        [`from_data_producer_minimum_information`][input4mips_validation.dataset.Input4MIPsDataset.from_data_producer_minimum_information].

        Parameters
        ----------
        data
            Raw data

        metadata_minimum
            Minimum metadata required from the data producer

            Unlike the single variable case,
            this also supplies the dataset category and the realm.

        cvs
            CVs to use for inference and validation

            If not supplied, this will be retrieved with
            [`load_cvs`][input4mips_validation.cvs.loading.load_cvs].

        prepare_func
            Function to use to prepare the data and infer the frequency metadata.

            It is called with the keyword arguments `ds_raw` and `copy_ds`
            and must return the prepared dataset and the frequency.

            If not supplied, we use
            `input4mips_validation.dataset.dataset.prepare_ds_and_get_frequency`.

        copy_ds
            Should `data` be copied before we create the `Input4MIPsDataset`?

        activity_id
            Activity ID that applies to the dataset.

            Given this is an Input4MIPsDataset, you shouldn't need to change this.

        variable_id
            The variable ID to use.

            For multi-variable datasets, as far as we are aware,
            this is always "multiple", hence you shouldn't need to change the defaults.

        Returns
        -------
        :
            Initialised instance

        Raises
        ------
        AssertionError
            The CVs' entry for the source ID does not specify a license ID.
        """
        ### From here until the end marker, the logic deliberately duplicates
        # the equivalent section of `from_data_producer_minimum_information`.
        # Sharing it would add a layer of abstraction and coupling
        # that isn't worth it right now.
        if cvs is None:
            cvs = load_cvs()

        prepare_func_use: PrepareFuncLike
        if prepare_func is not None:
            prepare_func_use = prepare_func
        else:
            prepare_func_use = prepare_ds_and_get_frequency  # type: ignore # can't get mypy to behave

        ds_prepared, frequency = prepare_func_use(
            # Any copying is done here, so the prepare function must not copy again
            ds_raw=data.copy() if copy_ds else data,
            copy_ds=False,
        )

        # [TODO: remove this once we are confident in our license checks]
        source_id_values = cvs.source_id_entries[metadata_minimum.source_id].values
        license_id = source_id_values.license_id
        if license_id is None:
            msg = "License ID must be specified in the CVs source ID"
            raise AssertionError(msg)
        ### End of deliberately duplicated logic

        metadata = Input4MIPsDatasetMetadata(
            activity_id=activity_id,
            contact=source_id_values.contact,
            dataset_category=metadata_minimum.dataset_category,
            frequency=frequency,
            further_info_url=source_id_values.further_info_url,
            grid_label=metadata_minimum.grid_label,
            # # TODO: look this up from central CVs
            # institution=source_id_values.institution,
            institution_id=source_id_values.institution_id,
            license=cvs.license_entries[license_id].values.conditions,
            license_id=license_id,
            mip_era=source_id_values.mip_era,
            nominal_resolution=metadata_minimum.nominal_resolution,
            realm=metadata_minimum.realm,
            source_id=metadata_minimum.source_id,
            source_version=source_id_values.source_version,
            target_mip=metadata_minimum.target_mip,
            variable_id=variable_id,
        )

        return cls(data=ds_prepared, metadata=metadata, cvs=cvs)

    def get_out_path_and_disk_ready_dataset(
        self,
        root_data_dir: Path,
        pint_dequantify_format: str = "cf",
        frequency_metadata_keys: FrequencyMetadataKeys = FrequencyMetadataKeys(),
        time_dimension: str = "time",
    ) -> tuple[Path, xr.Dataset]:
        """
        Get path in which to write and a disk-ready dataset

        Parameters
        ----------
        root_data_dir
            Root directory in which to write the file

        pint_dequantify_format
            Format to use when dequantifying variables with Pint.

            It is unlikely that you will want to change this.

        frequency_metadata_keys
            Metadata definitions for frequency information

        time_dimension
            The time dimension of the data.

            Required so that we know
            what information to pass to the path generating algorithm,
            in case the path generating algorithm requires time axis information.

        Returns
        -------
        :
            Path in which to write the file
            and the dataset, with all metadata set as attributes,
            to write in the file.

        Notes
        -----
        You will generally not want to write the output of this directly to disk,
        because it will not be CF-compliant.
        To see how to write CF-compliant files,
        see [`write`][input4mips_validation.dataset.Input4MIPsDataset.write].

        A new tracking ID and creation date are generated on every call.

        See Also
        --------
        [`write`][input4mips_validation.dataset.Input4MIPsDataset.write]
        """
        cvs = self.cvs

        # Can shallow copy as we don't alter the data from here on
        ds_disk = self.data.copy(deep=False)
        try:
            ds_disk = ds_disk.pint.dequantify(format=pint_dequantify_format)
        except AttributeError:
            logger.debug(
                "Not dequantifying with pint, "
                "I assume you know what you're doing with units"
            )

        # Add all the metadata.
        # Convert the input4MIPs metadata only once and reuse the result.
        input4mips_attrs = convert_input4mips_metadata_to_ds_attrs(self.metadata)
        if self.non_input4mips_metadata is not None:
            # Merge the metadata.
            # Validation ensures that there will be no clash of keys.
            # The input4MIPs values are the right-hand operand,
            # so they would take precedence if a key did appear in both.
            ds_disk.attrs = self.non_input4mips_metadata | input4mips_attrs

        else:
            ds_disk.attrs = input4mips_attrs

        # Must be unique for every written file,
        # so we deliberately don't provide a way
        # for the user to overwrite this at present
        # and we deliberately overwrite any existing values.
        ds_disk.attrs["tracking_id"] = generate_tracking_id()
        ds_disk.attrs["creation_date"] = generate_creation_timestamp()

        time_start, time_end = infer_time_start_time_end_for_filename(
            ds=ds_disk,
            frequency_metadata_key=frequency_metadata_keys.frequency_metadata_key,
            no_time_axis_frequency=frequency_metadata_keys.no_time_axis_frequency,
            time_dimension=time_dimension,
        )

        out_path = cvs.DRS.get_file_path(
            root_data_dir=root_data_dir,
            available_attributes=ds_disk.attrs,
            time_start=time_start,
            time_end=time_end,
        )

        return out_path, ds_disk

    def write(  # noqa: PLR0913
        self,
        root_data_dir: Path,
        pint_dequantify_format: str = "cf",
        unlimited_dimensions: tuple[str, ...] = ("time",),
        frequency_metadata_keys: FrequencyMetadataKeys = FrequencyMetadataKeys(),
        time_dimension: str = "time",
        xr_variable_processor: XRVariableProcessorLike = XRVariableHelper(),
        bounds_info: BoundsInfo | None = None,
    ) -> Path:
        """
        Write to disk

        This takes a very opinionated view of how to write to disk.
        If you need to alter this, please take the source code of this method
        as a template then alter as required.

        The steps are: get the output path and the disk-ready dataset,
        validate them, convert the dataset to iris cubes,
        create the output directory and finally save the cubes.

        Parameters
        ----------
        root_data_dir
            Root directory in which to write the file

        pint_dequantify_format
            Format to use when dequantifying variables with Pint.

            It is unlikely that you will want to change this.
            If you are not using pint for unit handling, this will be ignored.

        unlimited_dimensions
            Dimensions which should be unlimited in the written file

            This is passed to [iris.save][].

        frequency_metadata_keys
            Metadata definitions for frequency information

        time_dimension
            The time dimension of the data.

            Required so that we know
            what information to pass to the path generating algorithm,
            in case the path generating algorithm requires time axis information.

        xr_variable_processor
            Helper to use for processing the variables in xarray objects.

        bounds_info
            Metadata definitions for bounds handling

            If `None`, this will be inferred from the dataset being written.

        Returns
        -------
        :
            Path in which the file was written
        """
        target_path, ds_to_write = self.get_out_path_and_disk_ready_dataset(
            root_data_dir=root_data_dir,
            pint_dequantify_format=pint_dequantify_format,
            frequency_metadata_keys=frequency_metadata_keys,
            time_dimension=time_dimension,
        )

        # Validate before touching the file system.
        # As part of https://github.com/climate-resource/input4mips_validation/issues/14
        # add final validation here for bullet proofness
        # - tracking ID, creation date, comparison with DRS from cvs etc.
        get_ds_to_write_to_disk_validation_result(
            ds=ds_to_write,
            out_path=target_path,
            cvs=self.cvs,
            xr_variable_processor=xr_variable_processor,
            frequency_metadata_keys=frequency_metadata_keys,
            bounds_info=bounds_info,
        ).raise_if_errors()

        # iris does the writing, so go from xarray to cubes via ncdata
        cubes_to_save = ncdata.iris_xarray.cubes_from_xarray(ds_to_write)

        # Only create the target directory once validation and conversion
        # have both succeeded.
        target_path.parent.mkdir(parents=True, exist_ok=True)

        iris.save(
            cubes_to_save,
            target_path,
            unlimited_dimensions=unlimited_dimensions,
        )

        return target_path

`cvs: Input4MIPsCVs = field()` `class-attribute` `instance-attribute` #

Controlled vocabularies to use with this dataset

If not supplied, we create these with load_cvs

`data: xr.Dataset` `instance-attribute` #

Data

`metadata: Input4MIPsDatasetMetadata` `instance-attribute` #

Metadata

`non_input4mips_metadata: Optional[dict[str, str]] = field(default=None)` `class-attribute` `instance-attribute` #

Metadata that isn't part of input4MIPs' data model. This will simply be written as attributes to the file, as long as it doesn't clash with any of the input4MIPs keys.

`from_data_producer_minimum_information(data, metadata_minimum, cvs=None, prepare_func=None, copy_ds=True, activity_id='input4MIPs', dataset_category=None, realm=None, xr_variable_processor=XRVariableHelper())` `classmethod` #

Initialise from the minimum information required from the data producer

This applies to datasets that have a single variable. For multi-variable datasets, see from_data_producer_minimum_information_multiple_variable.

Parameters:

Name	Type	Description	Default
`data`	`Dataset`	Raw data	required
`metadata_minimum`	`Input4MIPsDatasetMetadataDataProducerMinimum`	Minimum metadata required from the data producer	required
`cvs`	`Input4MIPsCVs \| None`	CVs to use for inference and validation. If not supplied, this will be retrieved with `load_cvs`.	`None`
`prepare_func`	`PrepareFuncLike \| None`	Function to use to prepare the data, retrieve source ID values from the CVs and infer the frequency metadata. If not supplied, we use `input4mips_validation.dataset.dataset.prepare_ds_and_get_frequency`.	`None`
`copy_ds`	`bool`	Should `data` be copied before we create the `Input4MIPsDataset`?	`True`
`activity_id`	`str`	Activity ID that applies to the dataset. Given this is an Input4MIPsDataset, you shouldn't need to change this.	`'input4MIPs'`
`dataset_category`	`str \| None`	The category of the data. If not supplied, we will try and infer this based on `VARIABLE_DATASET_CATEGORY_MAP`.	`None`
`realm`	`str \| None`	The realm of the data. If not supplied, we will try and infer this based on `VARIABLE_REALM_MAP`.	`None`
`xr_variable_processor`	`XRVariableProcessorLike`	Helper to use for processing the variables in xarray objects.	`XRVariableHelper()`

Returns:

Type	Description
`Input4MIPsDataset`	Initialised instance

Source code in src/input4mips_validation/dataset/dataset.py

@classmethod
def from_data_producer_minimum_information(  # noqa: PLR0913
    cls,
    data: xr.Dataset,
    metadata_minimum: Input4MIPsDatasetMetadataDataProducerMinimum,
    cvs: Input4MIPsCVs | None = None,
    prepare_func: PrepareFuncLike | None = None,
    copy_ds: bool = True,
    activity_id: str = "input4MIPs",
    dataset_category: str | None = None,
    realm: str | None = None,
    xr_variable_processor: XRVariableProcessorLike = XRVariableHelper(),
) -> Input4MIPsDataset:
    """
    Initialise from the minimum information required from the data producer

    Use this for datasets that have a single variable.
    For multi-variable datasets, see
    [`from_data_producer_minimum_information_multiple_variable`][input4mips_validation.dataset.Input4MIPsDataset.from_data_producer_minimum_information_multiple_variable].

    Parameters
    ----------
    data
        Raw data

    metadata_minimum
        Minimum metadata required from the data producer

    cvs
        CVs to use for inference and validation

        If not supplied, this will be retrieved with
        [`load_cvs`][input4mips_validation.cvs.load_cvs]

    prepare_func
        Function to use to prepare the data and infer the frequency metadata.

        It is called with the keyword arguments `ds_raw` and `copy_ds`
        and must return the prepared dataset and the frequency.

        If not supplied, we use
        `input4mips_validation.dataset.dataset.prepare_ds_and_get_frequency`.

    copy_ds
        Should `data` be copied before we create the `Input4MIPsDataset`?

    activity_id
        Activity ID that applies to the dataset.

        Given this is an Input4MIPsDataset, you shouldn't need to change this.

    dataset_category
        The category of the data.

        If not supplied, we will try and infer this based on
        [`VARIABLE_DATASET_CATEGORY_MAP`][input4mips_validation.inference.from_data.VARIABLE_DATASET_CATEGORY_MAP].

    realm
        The realm of the data.

        If not supplied, we will try and infer this based on
        [`VARIABLE_REALM_MAP`][input4mips_validation.inference.from_data.VARIABLE_REALM_MAP].

    xr_variable_processor
        Helper to use for processing the variables in xarray objects.

    Returns
    -------
    :
        Initialised instance

    Raises
    ------
    AssertionError
        The CVs' entry for the source ID does not specify a license ID.
    """
    variable_id = get_ds_var_assert_single(
        data, xr_variable_processor=xr_variable_processor
    )

    ### From here until the end marker, the logic deliberately duplicates
    # the equivalent section of
    # `from_data_producer_minimum_information_multiple_variable`.
    # Sharing it would add a layer of abstraction and coupling
    # that isn't worth it right now.
    if cvs is None:
        cvs = load_cvs()

    prepare_func_use: PrepareFuncLike
    if prepare_func is not None:
        prepare_func_use = prepare_func
    else:
        prepare_func_use = prepare_ds_and_get_frequency  # type: ignore # can't get mypy to behave

    ds_prepared, frequency = prepare_func_use(
        # Any copying is done here, so the prepare function must not copy again
        ds_raw=data.copy() if copy_ds else data,
        copy_ds=False,
    )

    # [TODO: remove this once we are confident in our license checks]
    source_id_values = cvs.source_id_entries[metadata_minimum.source_id].values
    license_id = source_id_values.license_id
    if license_id is None:
        msg = "License ID must be specified in the CVs source ID"
        raise AssertionError(msg)
    ### End of deliberately duplicated logic

    # Fall back to the variable-based lookups for anything not supplied
    if dataset_category is None:
        dataset_category = VARIABLE_DATASET_CATEGORY_MAP[variable_id]

    if realm is None:
        realm = VARIABLE_REALM_MAP[variable_id]

    metadata = Input4MIPsDatasetMetadata(
        activity_id=activity_id,
        contact=source_id_values.contact,
        dataset_category=dataset_category,
        frequency=frequency,
        further_info_url=source_id_values.further_info_url,
        grid_label=metadata_minimum.grid_label,
        # # TODO: look this up from central CVs
        # institution=source_id_values.institution,
        institution_id=source_id_values.institution_id,
        license=cvs.license_entries[license_id].values.conditions,
        license_id=license_id,
        mip_era=source_id_values.mip_era,
        nominal_resolution=metadata_minimum.nominal_resolution,
        realm=realm,
        source_id=metadata_minimum.source_id,
        source_version=source_id_values.source_version,
        target_mip=metadata_minimum.target_mip,
        variable_id=variable_id,
    )

    return cls(data=ds_prepared, metadata=metadata, cvs=cvs)

`from_data_producer_minimum_information_multiple_variable(data, metadata_minimum, cvs=None, prepare_func=None, copy_ds=True, activity_id='input4MIPs', variable_id='multiple')` `classmethod` #

Initialise from the minimum information required from the data producer

This applies to datasets that have multiple variables. For single variable datasets, see from_data_producer_minimum_information.

Parameters:

Name	Type	Description	Default
`data`	`Dataset`	Raw data	required
`metadata_minimum`	`Input4MIPsDatasetMetadataDataProducerMultipleVariableMinimum`	Minimum metadata required from the data producer	required
`cvs`	`Input4MIPsCVs \| None`	CVs to use for inference and validation. If not supplied, this will be retrieved with `load_cvs`.	`None`
`prepare_func`	`PrepareFuncLike \| None`	Function to use to prepare the data, retrieve source ID values from the CVs and infer the frequency metadata. If not supplied, we use `input4mips_validation.dataset.dataset.prepare_ds_and_get_frequency`.	`None`
`copy_ds`	`bool`	Should `data` be copied before we create the `Input4MIPsDataset`?	`True`
`activity_id`	`str`	Activity ID that applies to the dataset. Given this is an Input4MIPsDataset, you shouldn't need to change this.	`'input4MIPs'`
`variable_id`	`str`	The variable ID to use. For multi-variable datasets, as far as we are aware, this is always "multiple", hence you shouldn't need to change the defaults.	`'multiple'`

Returns:

Type	Description
`Input4MIPsDataset`	Initialised instance

Source code in src/input4mips_validation/dataset/dataset.py

@classmethod
def from_data_producer_minimum_information_multiple_variable(  # noqa: PLR0913
    cls,
    data: xr.Dataset,
    metadata_minimum: Input4MIPsDatasetMetadataDataProducerMultipleVariableMinimum,
    cvs: Input4MIPsCVs | None = None,
    prepare_func: PrepareFuncLike | None = None,
    copy_ds: bool = True,
    activity_id: str = "input4MIPs",
    variable_id: str = "multiple",
) -> Input4MIPsDataset:
    """
    Initialise from the minimum information required from the data producer

    Use this for datasets that have multiple variables.
    For single variable datasets, see
    [`from_data_producer_minimum_information`][input4mips_validation.dataset.Input4MIPsDataset.from_data_producer_minimum_information].

    Parameters
    ----------
    data
        Raw data

    metadata_minimum
        Minimum metadata required from the data producer

        Unlike the single variable case,
        this also supplies the dataset category and the realm.

    cvs
        CVs to use for inference and validation

        If not supplied, this will be retrieved with
        [`load_cvs`][input4mips_validation.cvs.loading.load_cvs].

    prepare_func
        Function to use to prepare the data and infer the frequency metadata.

        It is called with the keyword arguments `ds_raw` and `copy_ds`
        and must return the prepared dataset and the frequency.

        If not supplied, we use
        `input4mips_validation.dataset.dataset.prepare_ds_and_get_frequency`.

    copy_ds
        Should `data` be copied before we create the `Input4MIPsDataset`?

    activity_id
        Activity ID that applies to the dataset.

        Given this is an Input4MIPsDataset, you shouldn't need to change this.

    variable_id
        The variable ID to use.

        For multi-variable datasets, as far as we are aware,
        this is always "multiple", hence you shouldn't need to change the defaults.

    Returns
    -------
    :
        Initialised instance

    Raises
    ------
    AssertionError
        The CVs' entry for the source ID does not specify a license ID.
    """
    ### From here until the end marker, the logic deliberately duplicates
    # the equivalent section of `from_data_producer_minimum_information`.
    # Sharing it would add a layer of abstraction and coupling
    # that isn't worth it right now.
    if cvs is None:
        cvs = load_cvs()

    prepare_func_use: PrepareFuncLike
    if prepare_func is not None:
        prepare_func_use = prepare_func
    else:
        prepare_func_use = prepare_ds_and_get_frequency  # type: ignore # can't get mypy to behave

    ds_prepared, frequency = prepare_func_use(
        # Any copying is done here, so the prepare function must not copy again
        ds_raw=data.copy() if copy_ds else data,
        copy_ds=False,
    )

    # [TODO: remove this once we are confident in our license checks]
    source_id_values = cvs.source_id_entries[metadata_minimum.source_id].values
    license_id = source_id_values.license_id
    if license_id is None:
        msg = "License ID must be specified in the CVs source ID"
        raise AssertionError(msg)
    ### End of deliberately duplicated logic

    metadata = Input4MIPsDatasetMetadata(
        activity_id=activity_id,
        contact=source_id_values.contact,
        dataset_category=metadata_minimum.dataset_category,
        frequency=frequency,
        further_info_url=source_id_values.further_info_url,
        grid_label=metadata_minimum.grid_label,
        # # TODO: look this up from central CVs
        # institution=source_id_values.institution,
        institution_id=source_id_values.institution_id,
        license=cvs.license_entries[license_id].values.conditions,
        license_id=license_id,
        mip_era=source_id_values.mip_era,
        nominal_resolution=metadata_minimum.nominal_resolution,
        realm=metadata_minimum.realm,
        source_id=metadata_minimum.source_id,
        source_version=source_id_values.source_version,
        target_mip=metadata_minimum.target_mip,
        variable_id=variable_id,
    )

    return cls(data=ds_prepared, metadata=metadata, cvs=cvs)

`from_ds(ds, cvs)` `classmethod` #

Initialise from an existing dataset

Parameters:

Name	Type	Description	Default
`ds`	`Dataset`	Dataset from which to initialise. We infer the metadata from `ds.attrs`.	required
`cvs`	`Input4MIPsCVs \| None`	Controlled vocabularies to use with the dataset	required

Returns:

Type	Description
`Input4MIPsDataset`	Initialised instance

Source code in src/input4mips_validation/dataset/dataset.py

@classmethod
def from_ds(
    cls,
    ds: xr.Dataset,
    cvs: Input4MIPsCVs | None,
) -> Input4MIPsDataset:
    """
    Initialise from an existing dataset

    Parameters
    ----------
    ds
        Dataset from which to initialise.
        We infer the metadata from `ds.attrs`.

        Every attribute whose name matches a field of
        `Input4MIPsDatasetMetadata` is used as metadata.
        All other attributes are stored as `non_input4mips_metadata`.

    cvs
        Controlled vocabularies to use with the dataset

        If `None`, `cvs` is not passed to the initialiser,
        so the class's default value for `cvs` is used.

    Returns
    -------
    :
        Initialised instance
    """
    # The instance holds a copy of `ds` with its attributes cleared,
    # the attributes are carried by the metadata objects instead.
    ds_stripped = ds.copy()
    ds_stripped.attrs = {}

    metadata_fields = [
        f.name for f in fields(Input4MIPsDatasetMetadata) if f.name in ds.attrs
    ]
    metadata = Input4MIPsDatasetMetadata(
        **{k: ds.attrs[k] for k in metadata_fields}
    )
    non_input4mips_metadata = {
        k: v for k, v in ds.attrs.items() if k not in metadata_fields
    }

    init_kwargs: dict[str, Any] = {
        "data": ds_stripped,
        "metadata": metadata,
        "non_input4mips_metadata": non_input4mips_metadata,
    }
    if cvs is not None:
        # Only pass `cvs` if we have a value, otherwise rely on the default.
        init_kwargs["cvs"] = cvs

    # Use `cls` rather than hard-coding the class name,
    # so that subclasses get an instance of their own type.
    return cls(**init_kwargs)

`get_out_path_and_disk_ready_dataset(root_data_dir, pint_dequantify_format='cf', frequency_metadata_keys=FrequencyMetadataKeys(), time_dimension='time')` #

Get path in which to write and a disk-ready dataset

Parameters:

Name	Type	Description	Default
`root_data_dir`	`Path`	Root directory in which to write the file	required
`pint_dequantify_format`	`str`	Format to use when dequantifying variables with Pint. It is unlikely that you will want to change this.	`'cf'`
`frequency_metadata_keys`	`FrequencyMetadataKeys`	Metadata definitions for frequency information	`FrequencyMetadataKeys()`
`time_dimension`	`str`	The time dimension of the data. Required so that we know what information to pass to the path generating algorithm, in case the path generating algorithm requires time axis information.	`'time'`

Returns:

Type	Description
`tuple[Path, Dataset]`	Path in which to write the file and the disk-ready dataset to write in the file.

Notes

You will generally not want to write the output of this directly to disk, because it will not be CF-compliant. To see how to write CF-compliant files, see write.

`write(root_data_dir, pint_dequantify_format='cf', unlimited_dimensions=('time',), frequency_metadata_keys=FrequencyMetadataKeys(), time_dimension='time', xr_variable_processor=XRVariableHelper(), bounds_info=None)` #

Write to disk

This takes a very opinionated view of how to write to disk. If you need to alter this, please take the source code of this method as a template then alter as required.

Parameters:

Name	Type	Description	Default
`root_data_dir`	`Path`	Root directory in which to write the file	required
`pint_dequantify_format`	`str`	Format to use when dequantifying variables with Pint. It is unlikely that you will want to change this. If you are not using pint for unit handling, this will be ignored.	`'cf'`
`unlimited_dimensions`	`tuple[str, ...]`	Dimensions which should be unlimited in the written file. This is passed to iris.save.	`('time',)`
`frequency_metadata_keys`	`FrequencyMetadataKeys`	Metadata definitions for frequency information	`FrequencyMetadataKeys()`
`time_dimension`	`str`	The time dimension of the data. Required so that we know what information to pass to the path generating algorithm, in case the path generating algorithm requires time axis information.	`'time'`
`xr_variable_processor`	`XRVariableProcessorLike`	Helper to use for processing the variables in xarray objects.	`XRVariableHelper()`
`bounds_info`	`BoundsInfo \| None`	Metadata definitions for bounds handling. If `None`, this will be inferred from the dataset.	`None`

Returns:

Type	Description
`Path`	Path in which the file was written

Source code in src/input4mips_validation/dataset/dataset.py

def write(  # noqa: PLR0913
    self,
    root_data_dir: Path,
    pint_dequantify_format: str = "cf",
    unlimited_dimensions: tuple[str, ...] = ("time",),
    frequency_metadata_keys: FrequencyMetadataKeys = FrequencyMetadataKeys(),
    time_dimension: str = "time",
    xr_variable_processor: XRVariableProcessorLike = XRVariableHelper(),
    bounds_info: BoundsInfo | None = None,
) -> Path:
    """
    Write to disk

    The steps are, in order:
    get the output path and the disk-ready dataset,
    validate the disk-ready dataset (raising if there are errors),
    convert it to iris cubes,
    create the output directory and save the cubes.

    This is a very opinionated way of writing to disk.
    If it doesn't suit your needs,
    use the source code of this method as a template and adapt it.

    Parameters
    ----------
    root_data_dir
        Root directory in which to write the file

    pint_dequantify_format
        Format to use when dequantifying variables with Pint.

        You are unlikely to need to change this.
        It is ignored if you are not using pint for unit handling.

    unlimited_dimensions
        Dimensions which should be unlimited in the written file

        This is passed to [iris.save][].

    frequency_metadata_keys
        Metadata definitions for frequency information

    time_dimension
        The time dimension of the data.

        Needed so that we know what information
        to pass to the path generating algorithm,
        in case that algorithm requires time axis information.

    xr_variable_processor
        Helper to use for processing the variables in xarray objects.

    bounds_info
        Metadata definitions for bounds handling

        It is passed through to the validation as-is.
        Per the original documentation,
        if `None`, the bounds information is inferred from the dataset.

    Returns
    -------
    :
        Path in which the file was written
    """
    target_path, disk_ready_ds = self.get_out_path_and_disk_ready_dataset(
        root_data_dir=root_data_dir,
        pint_dequantify_format=pint_dequantify_format,
        frequency_metadata_keys=frequency_metadata_keys,
        time_dimension=time_dimension,
    )

    # Validate before touching the disk.
    # As part of https://github.com/climate-resource/input4mips_validation/issues/14
    # add final validation here for bullet proofness
    # - tracking ID, creation date, comparison with DRS from cvs etc.
    get_ds_to_write_to_disk_validation_result(
        ds=disk_ready_ds,
        out_path=target_path,
        cvs=self.cvs,
        xr_variable_processor=xr_variable_processor,
        frequency_metadata_keys=frequency_metadata_keys,
        bounds_info=bounds_info,
    ).raise_if_errors()

    # ncdata does the xarray -> iris conversion
    iris_cubes = ncdata.iris_xarray.cubes_from_xarray(disk_ready_ds)

    # Only create the target directory once validation and conversion
    # have both succeeded.
    target_path.parent.mkdir(parents=True, exist_ok=True)

    iris.save(
        iris_cubes,
        target_path,
        unlimited_dimensions=unlimited_dimensions,
    )

    return target_path

`Input4MIPsDatasetMetadata` #

Metadata for an input4MIPs dataset

Source code in src/input4mips_validation/dataset/metadata.py

@frozen
class Input4MIPsDatasetMetadata:
    """
    Metadata for an input4MIPs dataset

    The class is declared with `@frozen`,
    so instances are intended to be immutable.
    Every field is a string.
    The required fields are declared first,
    followed by the optional fields, which all default to `None`.
    """

    activity_id: str
    """Activity ID that applies to the file"""

    contact: str
    """Email addresses to contact in case of questions about the file"""

    dataset_category: str
    """The file's category"""

    frequency: str
    """Frequency of the data in the file"""

    further_info_url: str
    """URL where further information about the file/data in the file can be found"""

    grid_label: str = field()
    """
    Label that identifies the file's grid

    [TODO: cross-ref to the CVs]
    """

    institution_id: str
    """ID of the institute that created the file"""

    license: str
    """License information for the dataset"""

    mip_era: str
    """The MIP era to which this file belongs"""

    nominal_resolution: str
    """Nominal resolution of the data in the file"""

    realm: str
    """The realm of the data in the file"""

    source_id: str
    """The ID of the file's source"""

    source_version: str
    """The version of the file, as defined by the source"""

    target_mip: str
    """The MIP that this file targets"""

    variable_id: str
    """The ID of the variable contained in the file"""

    # Optional fields below.
    # They have defaults, so they must stay after the required fields above.

    comment: Union[str, None] = None
    """
    Comments that apply to the file

    These are the comments included in the file itself.
    As a result, they can only apply to the file at the time of writing.
    For comments made about the file after the fact,
    e.g. reasons for deprecation,
    see `comment_post_publication`.
    """

    doi: Union[str, None] = None
    """The digital object identifier (DOI) associated with the file."""

    institution: Union[str, None] = None
    """Long-form description of the institute referred to by `institution_id`"""

    license_id: Union[str, None] = None
    """ID of the license that applies to this dataset"""

    product: Union[str, None] = None
    """The kind of data in the file"""

    region: Union[str, None] = None
    """The region of the data in the file"""

    source: Union[str, None] = None
    """Long-form description of the source referred to by `source_id`"""

`activity_id: str` `instance-attribute` #

Activity ID that applies to the file

`comment: Union[str, None] = None` `class-attribute` `instance-attribute` #

Comments that apply to the file

These are the comments included in the file itself. As a result, they can only apply to the file at the time of writing. For comments made about the file after the fact, e.g. reasons for deprecation, see comment_post_publication.

`contact: str` `instance-attribute` #

Email addresses to contact in case of questions about the file

`dataset_category: str` `instance-attribute` #

The file's category

`doi: Union[str, None] = None` `class-attribute` `instance-attribute` #

The digital object identifier (DOI) associated with the file.

`frequency: str` `instance-attribute` #

Frequency of the data in the file

`further_info_url: str` `instance-attribute` #

URL where further information about the file/data in the file can be found

`grid_label: str = field()` `class-attribute` `instance-attribute` #

Label that identifies the file's grid

[TODO: cross-ref to the CVs]

`institution: Union[str, None] = None` `class-attribute` `instance-attribute` #

Long-form description of the institute referred to by institution_id

`institution_id: str` `instance-attribute` #

ID of the institute that created the file

`license: str` `instance-attribute` #

License information for the dataset

`license_id: Union[str, None] = None` `class-attribute` `instance-attribute` #

ID of the license that applies to this dataset

`mip_era: str` `instance-attribute` #

The MIP era to which this file belongs

`nominal_resolution: str` `instance-attribute` #

Nominal resolution of the data in the file

`product: Union[str, None] = None` `class-attribute` `instance-attribute` #

The kind of data in the file

`realm: str` `instance-attribute` #

The realm of the data in the file

`region: Union[str, None] = None` `class-attribute` `instance-attribute` #

The region of the data in the file

`source: Union[str, None] = None` `class-attribute` `instance-attribute` #

Long-form description of the source referred to by source_id

`source_id: str` `instance-attribute` #

The ID of the file's source

`source_version: str` `instance-attribute` #

The version of the file, as defined by the source

`target_mip: str` `instance-attribute` #

The MIP that this file targets

`variable_id: str` `instance-attribute` #

The ID of the variable contained in the file

`Input4MIPsDatasetMetadataDataProducerMinimum` #

Minimum metadata required from an input4MIPs dataset producer

This is the minimum metadata required to create a valid Input4MIPsDataset object using from_data_producer_minimum_information.

Source code in src/input4mips_validation/dataset/metadata_data_producer_minimum.py

@frozen
class Input4MIPsDatasetMetadataDataProducerMinimum:
    """
    Minimum metadata required from an input4MIPs dataset producer

    This is the minimum metadata required to create a valid
    [`Input4MIPsDataset`][input4mips_validation.dataset.Input4MIPsDataset] object using
    [`from_data_producer_minimum_information`][input4mips_validation.dataset.Input4MIPsDataset.from_data_producer_minimum_information].

    All fields are required strings.
    """

    grid_label: str = field()
    """
    Label that identifies the file's grid

    [TODO: cross-ref to the CVs]
    """

    nominal_resolution: str
    """Nominal resolution of the data in the file"""

    source_id: str
    """The ID of the file's source"""

    target_mip: str
    """The MIP that this file targets"""

`grid_label: str = field()` `class-attribute` `instance-attribute` #

Label that identifies the file's grid

[TODO: cross-ref to the CVs]

`nominal_resolution: str` `instance-attribute` #

Nominal resolution of the data in the file

`source_id: str` `instance-attribute` #

The ID of the file's source

`target_mip: str` `instance-attribute` #

The MIP that this file targets

`Input4MIPsDatasetMetadataDataProducerMultipleVariableMinimum` #

Minimum metadata required from input4MIPs dataset producer for a multi-variable file

This is the minimum metadata required to create a valid Input4MIPsDataset object using from_data_producer_minimum_information_multiple_variable.

Source code in src/input4mips_validation/dataset/metadata_data_producer_multiple_variable_minimum.py

@frozen
class Input4MIPsDatasetMetadataDataProducerMultipleVariableMinimum:
    """
    Minimum metadata required from input4MIPs dataset producer for a multi-variable file

    This is the minimum metadata required to create a valid
    [`Input4MIPsDataset`][input4mips_validation.dataset.Input4MIPsDataset] object using
    [`from_data_producer_minimum_information_multiple_variable`][input4mips_validation.dataset.dataset.Input4MIPsDataset.from_data_producer_minimum_information_multiple_variable].

    All fields are required strings.
    """

    grid_label: str = field()
    """
    Label that identifies the file's grid

    [TODO: cross-ref to the CVs]
    """

    nominal_resolution: str
    """Nominal resolution of the data in the file"""

    source_id: str
    """The ID of the file's source"""

    target_mip: str
    """The MIP that this file targets"""

    dataset_category: str
    """The file's category"""

    realm: str
    """The realm of the data in the file"""

`dataset_category: str` `instance-attribute` #

The file's category

`grid_label: str = field()` `class-attribute` `instance-attribute` #

Label that identifies the file's grid

[TODO: cross-ref to the CVs]

`nominal_resolution: str` `instance-attribute` #

Nominal resolution of the data in the file

`realm: str` `instance-attribute` #

The realm of the data in the file

`source_id: str` `instance-attribute` #

The ID of the file's source

`target_mip: str` `instance-attribute` #

The MIP that this file targets

input4mips_validation.dataset#