Skip to content

input4mips_validation.dataset#

Sub-package Description
dataset Dataset class definition
metadata Metadata for Input4MIPsDataset objects
metadata_data_producer_minimum Minimum metadata required from an input4MIPs dataset producer
metadata_data_producer_multiple_variable_minimum Minimum metadata required from an input4MIPs dataset producer for a multi-variable file

input4mips_validation.dataset #

Classes that define an input4MIPs dataset and associated metadata

Input4MIPsDataset #

Representation of an input4MIPs dataset

For validation, see [TODO: validate_input4mips_ds function and then cross-ref here].

Source code in src/input4mips_validation/dataset/dataset.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
@frozen
class Input4MIPsDataset:
    """
    Representation of an input4MIPs dataset

    For validation, see
    [TODO: `validate_input4mips_ds` function and then cross-ref here].
    """

    data: xr.Dataset
    """
    Data
    """

    metadata: Input4MIPsDatasetMetadata
    """
    Metadata
    """

    cvs: Input4MIPsCVs = field()
    """
    Controlled vocabularies to use with this dataset

    If not supplied, we create these with
    [`load_cvs`][input4mips_validation.cvs.loading.load_cvs]
    """

    non_input4mips_metadata: Optional[dict[str, str]] = field(default=None)
    """
    Metadata that isn't part of input4MIPs' data model

    This will simply be written as attributes to the file,
    as long as it doesn't clash with any of the input4MIPs keys.
    """

    @non_input4mips_metadata.validator
    def _no_clash_with_metadata_attributes(
        self, attribute: attr.Attribute[Any], value: dict[str, Any] | None
    ) -> None:
        """
        Check that `value` shares no keys with `self.metadata`

        Parameters
        ----------
        attribute
            Attribute being validated

        value
            Value being validated. `None` is always accepted.

        Raises
        ------
        AssertionError
            `value` contains keys that are also keys of `self.metadata`
        """
        if value is None:
            return

        # Convert the metadata once, rather than once per key in `value`.
        metadata_keys = asdict(self.metadata).keys()
        clashing_keys = [key for key in value if key in metadata_keys]
        if clashing_keys:
            msg = (
                f"{attribute.name} must not contain any keys "
                "that clash with the `self.metadata`. "
                f"Keys in both {attribute.name} and `self.metadata`: {clashing_keys}"
            )
            raise AssertionError(msg)

    @cvs.default
    def _load_default_cvs(self) -> Input4MIPsCVs:
        """
        Get the CVs to use when none are supplied at initialisation
        """
        return load_cvs()

    @classmethod
    def from_ds(
        cls,
        ds: xr.Dataset,
        cvs: Input4MIPsCVs | None,
    ) -> Input4MIPsDataset:
        """
        Initialise from an existing dataset

        Parameters
        ----------
        ds
            Dataset from which to initialise.

            We infer the metadata from `ds.attrs`.
            Attributes that match a field of `Input4MIPsDatasetMetadata`
            are used as metadata,
            all other attributes are stored as `non_input4mips_metadata`.

        cvs
            Controlled vocabularies to use with the dataset

            If `None`, these are loaded with
            [`load_cvs`][input4mips_validation.cvs.loading.load_cvs].

        Returns
        -------
        :
            Initialised instance
        """
        # The attributes are carried by the metadata objects instead,
        # so the data we hold has its attributes removed.
        ds_stripped = ds.copy()
        ds_stripped.attrs = {}

        metadata_fields = [
            f.name for f in fields(Input4MIPsDatasetMetadata) if f.name in ds.attrs
        ]
        metadata = Input4MIPsDatasetMetadata(
            **{k: ds.attrs[k] for k in metadata_fields}
        )
        non_input4mips_metadata = {
            k: v for k, v in ds.attrs.items() if k not in metadata_fields
        }

        # Same handling of missing CVs as the other alternate constructors.
        if cvs is None:
            cvs = load_cvs()

        # Use `cls` rather than the hard-coded class so sub-classes
        # get an instance of their own type.
        return cls(
            data=ds_stripped,
            metadata=metadata,
            non_input4mips_metadata=non_input4mips_metadata,
            cvs=cvs,
        )

    @classmethod
    def from_data_producer_minimum_information(  # noqa: PLR0913
        cls,
        data: xr.Dataset,
        metadata_minimum: Input4MIPsDatasetMetadataDataProducerMinimum,
        cvs: Input4MIPsCVs | None = None,
        prepare_func: PrepareFuncLike | None = None,
        copy_ds: bool = True,
        activity_id: str = "input4MIPs",
        dataset_category: str | None = None,
        realm: str | None = None,
        xr_variable_processor: XRVariableProcessorLike = XRVariableHelper(),
    ) -> Input4MIPsDataset:
        """
        Initialise from the minimum information required from the data producer

        This applies to datasets that have a single variable.
        For multi-variable datasets, see
        [`from_data_producer_minimum_information_multiple_variable`][input4mips_validation.dataset.Input4MIPsDataset.from_data_producer_minimum_information_multiple_variable].

        Parameters
        ----------
        data
            Raw data

        metadata_minimum
            Minimum metadata required from the data producer

        cvs
            CVs to use for inference and validation

            If not supplied, this will be retrieved with
            [`load_cvs`][input4mips_validation.cvs.loading.load_cvs].

        prepare_func
            Function to use to prepare the data, retrieve source ID values from the CVs
            and infer the frequency metadata.

            If not supplied, we use
            [`input4mips_validation.dataset.dataset.prepare_ds_and_get_frequency`].

        copy_ds
            Should `data` be copied before we create the `Input4MIPsDataset`?

        activity_id
            Activity ID that applies to the dataset.

            Given this is an Input4MIPsDataset, you shouldn't need to change this.

        dataset_category
            The category of the data.

            If not supplied, we will try and infer this based on
            [`VARIABLE_DATASET_CATEGORY_MAP`][input4mips_validation.inference.from_data.VARIABLE_DATASET_CATEGORY_MAP].

        realm
            The realm of the data.

            If not supplied, we will try and infer this based on
            [`VARIABLE_REALM_MAP`][input4mips_validation.inference.from_data.VARIABLE_REALM_MAP].

        xr_variable_processor
            Helper to use for processing the variables in xarray objects.

        Returns
        -------
        :
            Initialised instance

        Raises
        ------
        AssertionError
            The CVs entry for the source ID does not specify a license ID
        """
        variable_id = get_ds_var_assert_single(
            data, xr_variable_processor=xr_variable_processor
        )

        ### This section deliberately mirrors the equivalent logic in
        # `from_data_producer_minimum_information_multiple_variable`.
        # This is on purpose, the extra layer of abstraction
        # and coupling isn't worth it right now.
        if cvs is None:
            cvs = load_cvs()

        if prepare_func is None:
            prepare_func_use: PrepareFuncLike = prepare_ds_and_get_frequency  # type: ignore # can't get mypy to behave
        else:
            prepare_func_use = prepare_func

        if copy_ds:
            data = data.copy()

        # Any copying was handled above, so the prepare function must not copy.
        data, frequency = prepare_func_use(ds_raw=data, copy_ds=False)

        # [TODO: remove this once we are confident in our license checks]
        source_id_values = cvs.source_id_entries[metadata_minimum.source_id].values
        license_id = source_id_values.license_id
        if license_id is None:
            msg = "License ID must be specified in the CVs source ID"
            raise AssertionError(msg)
        ### End of mirrored section

        # Fall back to inference from the variable for anything not supplied.
        if dataset_category is None:
            dataset_category = VARIABLE_DATASET_CATEGORY_MAP[variable_id]

        if realm is None:
            realm = VARIABLE_REALM_MAP[variable_id]

        metadata = Input4MIPsDatasetMetadata(
            activity_id=activity_id,
            contact=source_id_values.contact,
            dataset_category=dataset_category,
            frequency=frequency,
            further_info_url=source_id_values.further_info_url,
            grid_label=metadata_minimum.grid_label,
            # TODO: look up `institution` from central CVs
            institution_id=source_id_values.institution_id,
            license=cvs.license_entries[license_id].values.conditions,
            license_id=license_id,
            mip_era=source_id_values.mip_era,
            nominal_resolution=metadata_minimum.nominal_resolution,
            realm=realm,
            source_id=metadata_minimum.source_id,
            source_version=source_id_values.source_version,
            target_mip=metadata_minimum.target_mip,
            variable_id=variable_id,
        )

        return cls(data=data, metadata=metadata, cvs=cvs)

    @classmethod
    def from_data_producer_minimum_information_multiple_variable(  # noqa: PLR0913
        cls,
        data: xr.Dataset,
        metadata_minimum: Input4MIPsDatasetMetadataDataProducerMultipleVariableMinimum,
        cvs: Input4MIPsCVs | None = None,
        prepare_func: PrepareFuncLike | None = None,
        copy_ds: bool = True,
        activity_id: str = "input4MIPs",
        variable_id: str = "multiple",
    ) -> Input4MIPsDataset:
        """
        Initialise from the minimum information required from the data producer

        This applies to datasets that have multiple variables.
        For single variable datasets, see
        [`from_data_producer_minimum_information`][input4mips_validation.dataset.Input4MIPsDataset.from_data_producer_minimum_information].

        Parameters
        ----------
        data
            Raw data

        metadata_minimum
            Minimum metadata required from the data producer

        cvs
            CVs to use for inference and validation

            If not supplied, this will be retrieved with
            [`load_cvs`][input4mips_validation.cvs.loading.load_cvs].

        prepare_func
            Function to use to prepare the data, retrieve source ID values from the CVs
            and infer the frequency metadata.

            If not supplied, we use
            [`input4mips_validation.dataset.dataset.prepare_ds_and_get_frequency`].

        copy_ds
            Should `data` be copied before we create the `Input4MIPsDataset`?

        activity_id
            Activity ID that applies to the dataset.

            Given this is an Input4MIPsDataset, you shouldn't need to change this.

        variable_id
            The variable ID to use.

            For multi-variable datasets, as far as we are aware,
            this is always "multiple", hence you shouldn't need to change the defaults.

        Returns
        -------
        :
            Initialised instance

        Raises
        ------
        AssertionError
            The CVs entry for the source ID does not specify a license ID
        """
        ### This section deliberately mirrors the equivalent logic in
        # `from_data_producer_minimum_information`.
        # This is on purpose, the extra layer of abstraction
        # and coupling isn't worth it right now.
        if cvs is None:
            cvs = load_cvs()

        if prepare_func is None:
            prepare_func_use: PrepareFuncLike = prepare_ds_and_get_frequency  # type: ignore # can't get mypy to behave
        else:
            prepare_func_use = prepare_func

        if copy_ds:
            data = data.copy()

        # Any copying was handled above, so the prepare function must not copy.
        data, frequency = prepare_func_use(ds_raw=data, copy_ds=False)

        # [TODO: remove this once we are confident in our license checks]
        source_id_values = cvs.source_id_entries[metadata_minimum.source_id].values
        license_id = source_id_values.license_id
        if license_id is None:
            msg = "License ID must be specified in the CVs source ID"
            raise AssertionError(msg)
        ### End of mirrored section

        metadata = Input4MIPsDatasetMetadata(
            activity_id=activity_id,
            contact=source_id_values.contact,
            dataset_category=metadata_minimum.dataset_category,
            frequency=frequency,
            further_info_url=source_id_values.further_info_url,
            grid_label=metadata_minimum.grid_label,
            # TODO: look up `institution` from central CVs
            institution_id=source_id_values.institution_id,
            license=cvs.license_entries[license_id].values.conditions,
            license_id=license_id,
            mip_era=source_id_values.mip_era,
            nominal_resolution=metadata_minimum.nominal_resolution,
            realm=metadata_minimum.realm,
            source_id=metadata_minimum.source_id,
            source_version=source_id_values.source_version,
            target_mip=metadata_minimum.target_mip,
            variable_id=variable_id,
        )

        return cls(data=data, metadata=metadata, cvs=cvs)

    def get_out_path_and_disk_ready_dataset(
        self,
        root_data_dir: Path,
        pint_dequantify_format: str = "cf",
        frequency_metadata_keys: FrequencyMetadataKeys = FrequencyMetadataKeys(),
        time_dimension: str = "time",
    ) -> tuple[Path, xr.Dataset]:
        """
        Get path in which to write and a disk-ready dataset

        Parameters
        ----------
        root_data_dir
            Root directory in which to write the file

        pint_dequantify_format
            Format to use when dequantifying variables with Pint.

            It is unlikely that you will want to change this.

        frequency_metadata_keys
            Metadata definitions for frequency information

        time_dimension
            The time dimension of the data.

            Required so that we know
            what information to pass to the path generating algorithm,
            in case the path generating algorithm requires time axis information.

        Returns
        -------
        :
            Path in which to write the file
            and the dataset, with all metadata set as attributes,
            to write in the file.

        Notes
        -----
        You will generally not want to write the output of this directly to disk,
        because it will not be CF-compliant.
        To see how to write CF-compliant files,
        see [`write`][input4mips_validation.dataset.Input4MIPsDataset.write].

        See Also
        --------
        [`write`][input4mips_validation.dataset.Input4MIPsDataset.write]
        """
        # Can shallow copy as we don't alter the data from here on
        ds_disk = self.data.copy(deep=False)
        try:
            ds_disk = ds_disk.pint.dequantify(format=pint_dequantify_format)
        except AttributeError:
            logger.debug(
                "Not dequantifying with pint, "
                "I assume you know what you're doing with units"
            )

        # Add all the metadata, converting the input4MIPs metadata only once.
        attrs = convert_input4mips_metadata_to_ds_attrs(self.metadata)
        if self.non_input4mips_metadata is not None:
            # Merge the metadata.
            # Validation ensures that there will be no clash of keys.
            attrs = self.non_input4mips_metadata | attrs

        ds_disk.attrs = attrs

        # Must be unique for every written file,
        # so we deliberately don't provide a way
        # for the user to overwrite this at present
        # and we deliberately overwrite any existing values.
        ds_disk.attrs["tracking_id"] = generate_tracking_id()
        ds_disk.attrs["creation_date"] = generate_creation_timestamp()

        time_start, time_end = infer_time_start_time_end_for_filename(
            ds=ds_disk,
            frequency_metadata_key=frequency_metadata_keys.frequency_metadata_key,
            no_time_axis_frequency=frequency_metadata_keys.no_time_axis_frequency,
            time_dimension=time_dimension,
        )

        out_path = self.cvs.DRS.get_file_path(
            root_data_dir=root_data_dir,
            available_attributes=ds_disk.attrs,
            time_start=time_start,
            time_end=time_end,
        )

        return out_path, ds_disk

    def write(  # noqa: PLR0913
        self,
        root_data_dir: Path,
        pint_dequantify_format: str = "cf",
        unlimited_dimensions: tuple[str, ...] = ("time",),
        frequency_metadata_keys: FrequencyMetadataKeys = FrequencyMetadataKeys(),
        time_dimension: str = "time",
        xr_variable_processor: XRVariableProcessorLike = XRVariableHelper(),
        bounds_info: BoundsInfo | None = None,
    ) -> Path:
        """
        Write to disk

        This takes a very opinionated view of how to write to disk.
        If you need to alter this, please take the source code of this method
        as a template then alter as required.

        Parameters
        ----------
        root_data_dir
            Root directory in which to write the file

        pint_dequantify_format
            Format to use when dequantifying variables with Pint.

            It is unlikely that you will want to change this.
            If you are not using pint for unit handling, this will be ignored.

        unlimited_dimensions
            Dimensions which should be unlimited in the written file

            This is passed to [iris.save][].

        frequency_metadata_keys
            Metadata definitions for frequency information

        time_dimension
            The time dimension of the data.

            Required so that we know
            what information to pass to the path generating algorithm,
            in case the path generating algorithm requires time axis information.

        xr_variable_processor
            Helper to use for processing the variables in xarray objects.

        bounds_info
            Metadata definitions for bounds handling

            If `None`, this will be inferred from the dataset being written.

        Returns
        -------
        :
            Path in which the file was written
        """
        out_path, ds_disk_ready = self.get_out_path_and_disk_ready_dataset(
            root_data_dir=root_data_dir,
            pint_dequantify_format=pint_dequantify_format,
            frequency_metadata_keys=frequency_metadata_keys,
            time_dimension=time_dimension,
        )

        # Validate
        # As part of https://github.com/climate-resource/input4mips_validation/issues/14
        # add final validation here for bullet proofness
        # - tracking ID, creation date, comparison with DRS from cvs etc.
        validation_result = get_ds_to_write_to_disk_validation_result(
            ds=ds_disk_ready,
            out_path=out_path,
            cvs=self.cvs,
            xr_variable_processor=xr_variable_processor,
            frequency_metadata_keys=frequency_metadata_keys,
            bounds_info=bounds_info,
        )
        validation_result.raise_if_errors()

        # Convert to cubes with ncdata
        cubes = ncdata.iris_xarray.cubes_from_xarray(ds_disk_ready)

        # Having validated and converted to cubes, make the target directory.
        out_path.parent.mkdir(parents=True, exist_ok=True)

        # Write the file to disk
        iris.save(
            cubes,
            out_path,
            unlimited_dimensions=unlimited_dimensions,
        )

        return out_path

cvs: Input4MIPsCVs = field() class-attribute instance-attribute #

Controlled vocabularies to use with this dataset

If not supplied, we create these with load_cvs

data: xr.Dataset instance-attribute #

Data

metadata: Input4MIPsDatasetMetadata instance-attribute #

Metadata

non_input4mips_metadata: Optional[dict[str, str]] = field(default=None) class-attribute instance-attribute #

Metadata that isn't part of input4MIPs' data model. This will simply be written as attributes to the file, as long as it doesn't clash with any of the input4MIPs keys.

from_data_producer_minimum_information(data, metadata_minimum, cvs=None, prepare_func=None, copy_ds=True, activity_id='input4MIPs', dataset_category=None, realm=None, xr_variable_processor=XRVariableHelper()) classmethod #

Initialise from the minimum information required from the data producer

This applies to datasets that have a single variable. For multi-variable datasets, see from_data_producer_minimum_information_multiple_variable.

Parameters:

Name Type Description Default
data Dataset

Raw data

required
metadata_minimum Input4MIPsDatasetMetadataDataProducerMinimum

Minimum metadata required from the data producer

required
cvs Input4MIPsCVs | None

CVs to use for inference and validation

If not supplied, this will be retrieved with load_cvs

None
prepare_func PrepareFuncLike | None

Function to use to prepare the data, retrieve source ID values from the CVs and infer the frequency metadata.

If not supplied, we use [input4mips_validation.dataset.dataset.prepare_ds_and_get_frequency].

None
copy_ds bool

Should data be copied before we create the Input4MIPsDataset?

True
activity_id str

Activity ID that applies to the dataset.

Given this is an Input4MIPsDataset, you shouldn't need to change this.

'input4MIPs'
dataset_category str | None

The category of the data.

If not supplied, we will try and infer this based on VARIABLE_DATASET_CATEGORY_MAP.

None
realm str | None

The realm of the data.

If not supplied, we will try and infer this based on VARIABLE_REALM_MAP.

None
xr_variable_processor XRVariableProcessorLike

Helper to use for processing the variables in xarray objects.

XRVariableHelper()

Returns:

Type Description
Input4MIPsDataset

Initialised instance

Source code in src/input4mips_validation/dataset/dataset.py
@classmethod
def from_data_producer_minimum_information(  # noqa: PLR0913
    cls,
    data: xr.Dataset,
    metadata_minimum: Input4MIPsDatasetMetadataDataProducerMinimum,
    cvs: Input4MIPsCVs | None = None,
    prepare_func: PrepareFuncLike | None = None,
    copy_ds: bool = True,
    activity_id: str = "input4MIPs",
    dataset_category: str | None = None,
    realm: str | None = None,
    xr_variable_processor: XRVariableProcessorLike = XRVariableHelper(),
) -> Input4MIPsDataset:
    """
    Initialise from the minimum information required from the data producer

    This applies to datasets that have a single variable.
    For multi-variable datasets, see
    [`from_data_producer_minimum_information_multiple_variable`][input4mips_validation.dataset.Input4MIPsDataset.from_data_producer_minimum_information_multiple_variable].

    Parameters
    ----------
    data
        Raw data

    metadata_minimum
        Minimum metadata required from the data producer

    cvs
        CVs to use for inference and validation

        If not supplied, this will be retrieved with
        [`load_cvs`][input4mips_validation.cvs.loading.load_cvs].

    prepare_func
        Function to use to prepare the data, retrieve source ID values from the CVs
        and infer the frequency metadata.

        If not supplied, we use
        [`input4mips_validation.dataset.dataset.prepare_ds_and_get_frequency`].

    copy_ds
        Should `data` be copied before we create the `Input4MIPsDataset`?

    activity_id
        Activity ID that applies to the dataset.

        Given this is an Input4MIPsDataset, you shouldn't need to change this.

    dataset_category
        The category of the data.

        If not supplied, we will try and infer this based on
        [`VARIABLE_DATASET_CATEGORY_MAP`][input4mips_validation.inference.from_data.VARIABLE_DATASET_CATEGORY_MAP].

    realm
        The realm of the data.

        If not supplied, we will try and infer this based on
        [`VARIABLE_REALM_MAP`][input4mips_validation.inference.from_data.VARIABLE_REALM_MAP].

    xr_variable_processor
        Helper to use for processing the variables in xarray objects.

    Returns
    -------
    :
        Initialised instance

    Raises
    ------
    AssertionError
        The CVs entry for the source ID does not specify a license ID
    """
    variable_id = get_ds_var_assert_single(
        data, xr_variable_processor=xr_variable_processor
    )

    ### This section deliberately mirrors the equivalent logic in
    # `from_data_producer_minimum_information_multiple_variable`.
    # This is on purpose, the extra layer of abstraction
    # and coupling isn't worth it right now.
    if cvs is None:
        cvs = load_cvs()

    if prepare_func is None:
        prepare_func_use: PrepareFuncLike = prepare_ds_and_get_frequency  # type: ignore # can't get mypy to behave
    else:
        prepare_func_use = prepare_func

    if copy_ds:
        data = data.copy()

    # Any copying was handled above, so the prepare function must not copy.
    data, frequency = prepare_func_use(ds_raw=data, copy_ds=False)

    # [TODO: remove this once we are confident in our license checks]
    source_id_values = cvs.source_id_entries[metadata_minimum.source_id].values
    license_id = source_id_values.license_id
    if license_id is None:
        msg = "License ID must be specified in the CVs source ID"
        raise AssertionError(msg)
    ### End of mirrored section

    # Fall back to inference from the variable for anything not supplied.
    if dataset_category is None:
        dataset_category = VARIABLE_DATASET_CATEGORY_MAP[variable_id]

    if realm is None:
        realm = VARIABLE_REALM_MAP[variable_id]

    metadata = Input4MIPsDatasetMetadata(
        activity_id=activity_id,
        contact=source_id_values.contact,
        dataset_category=dataset_category,
        frequency=frequency,
        further_info_url=source_id_values.further_info_url,
        grid_label=metadata_minimum.grid_label,
        # TODO: look up `institution` from central CVs
        institution_id=source_id_values.institution_id,
        license=cvs.license_entries[license_id].values.conditions,
        license_id=license_id,
        mip_era=source_id_values.mip_era,
        nominal_resolution=metadata_minimum.nominal_resolution,
        realm=realm,
        source_id=metadata_minimum.source_id,
        source_version=source_id_values.source_version,
        target_mip=metadata_minimum.target_mip,
        variable_id=variable_id,
    )

    return cls(data=data, metadata=metadata, cvs=cvs)

from_data_producer_minimum_information_multiple_variable(data, metadata_minimum, cvs=None, prepare_func=None, copy_ds=True, activity_id='input4MIPs', variable_id='multiple') classmethod #

Initialise from the minimum information required from the data producer

This applies to datasets that have multiple variables. For single variable datasets, see from_data_producer_minimum_information.

Parameters:

Name Type Description Default
data Dataset

Raw data

required
metadata_minimum Input4MIPsDatasetMetadataDataProducerMultipleVariableMinimum

Minimum metadata required from the data producer

required
cvs Input4MIPsCVs | None

CVs to use for inference and validation

If not supplied, this will be retrieved with load_cvs.

None
prepare_func PrepareFuncLike | None

Function to use to prepare the data, retrieve source ID values from the CVs and infer the frequency metadata.

If not supplied, we use [input4mips_validation.dataset.dataset.prepare_ds_and_get_frequency].

None
copy_ds bool

Should data be copied before we create the Input4MIPsDataset?

True
activity_id str

Activity ID that applies to the dataset.

Given this is an Input4MIPsDataset, you shouldn't need to change this.

'input4MIPs'
variable_id str

The variable ID to use.

For multi-variable datasets, as far as we are aware, this is always "multiple", hence you shouldn't need to change the defaults.

'multiple'

Returns:

Type Description
Input4MIPsDataset

Initialised instance

Source code in src/input4mips_validation/dataset/dataset.py
@classmethod
def from_data_producer_minimum_information_multiple_variable(  # noqa: PLR0913
    cls,
    data: xr.Dataset,
    metadata_minimum: Input4MIPsDatasetMetadataDataProducerMultipleVariableMinimum,
    cvs: Input4MIPsCVs | None = None,
    prepare_func: PrepareFuncLike | None = None,
    copy_ds: bool = True,
    activity_id: str = "input4MIPs",
    variable_id: str = "multiple",
) -> Input4MIPsDataset:
    """
    Initialise from the minimum information required from the data producer

    This applies to datasets that have multiple variables.
    For single variable datasets, see
    [`from_data_producer_minimum_information`][input4mips_validation.dataset.Input4MIPsDataset.from_data_producer_minimum_information].

    Parameters
    ----------
    data
        Raw data

    metadata_minimum
        Minimum metadata required from the data producer

    cvs
        CVs to use for inference and validation

        If not supplied, this will be retrieved with
        [`load_cvs`][input4mips_validation.cvs.loading.load_cvs].

    prepare_func
        Function to use to prepare the data, retrieve source ID values from the CVs
        and infer the frequency metadata.

        If not supplied, we use
        [`input4mips_validation.dataset.dataset.prepare_ds_and_get_frequency`].

    copy_ds
        Should `data` be copied before we create the `Input4MIPsDataset`?

    activity_id
        Activity ID that applies to the dataset.

        Given this is an Input4MIPsDataset, you shouldn't need to change this.

    variable_id
        The variable ID to use.

        For multi-variable datasets, as far as we are aware,
        this is always "multiple", hence you shouldn't need to change the defaults.

    Returns
    -------
    :
        Initialised instance

    Raises
    ------
    AssertionError
        The CVs entry for the source ID does not specify a license ID
    """
    ### This section deliberately mirrors the equivalent logic in
    # `from_data_producer_minimum_information`.
    # This is on purpose, the extra layer of abstraction
    # and coupling isn't worth it right now.
    if cvs is None:
        cvs = load_cvs()

    if prepare_func is None:
        prepare_func_use: PrepareFuncLike = prepare_ds_and_get_frequency  # type: ignore # can't get mypy to behave
    else:
        prepare_func_use = prepare_func

    if copy_ds:
        data = data.copy()

    # Any copying was handled above, so the prepare function must not copy.
    data, frequency = prepare_func_use(ds_raw=data, copy_ds=False)

    # [TODO: remove this once we are confident in our license checks]
    source_id_values = cvs.source_id_entries[metadata_minimum.source_id].values
    license_id = source_id_values.license_id
    if license_id is None:
        msg = "License ID must be specified in the CVs source ID"
        raise AssertionError(msg)
    ### End of mirrored section

    metadata = Input4MIPsDatasetMetadata(
        activity_id=activity_id,
        contact=source_id_values.contact,
        dataset_category=metadata_minimum.dataset_category,
        frequency=frequency,
        further_info_url=source_id_values.further_info_url,
        grid_label=metadata_minimum.grid_label,
        # TODO: look up `institution` from central CVs
        institution_id=source_id_values.institution_id,
        license=cvs.license_entries[license_id].values.conditions,
        license_id=license_id,
        mip_era=source_id_values.mip_era,
        nominal_resolution=metadata_minimum.nominal_resolution,
        realm=metadata_minimum.realm,
        source_id=metadata_minimum.source_id,
        source_version=source_id_values.source_version,
        target_mip=metadata_minimum.target_mip,
        variable_id=variable_id,
    )

    return cls(data=data, metadata=metadata, cvs=cvs)

from_ds(ds, cvs) classmethod #

Initialise from an existing dataset

Parameters:

Name Type Description Default
ds Dataset

Dataset from which to initialise. We infer the metadata from ds.attrs.

required
cvs Input4MIPsCVs | None

Controlled vocabularies to use with the dataset

required

Returns:

Type Description
Input4MIPsDataset

Initialised instance
Source code in src/input4mips_validation/dataset/dataset.py
@classmethod
def from_ds(
    cls,
    ds: xr.Dataset,
    cvs: Input4MIPsCVs | None,
) -> Input4MIPsDataset:
    """
    Initialise from an existing dataset

    Parameters
    ----------
    ds
        Dataset from which to initialise.
        We infer the metadata from `ds.attrs`.
        Attributes whose name matches a field of `Input4MIPsDatasetMetadata`
        are used to build the metadata.
        All other attributes are kept as non-input4MIPs metadata.

    cvs
        Controlled vocabularies to use with the dataset

        If `None`, no `cvs` argument is passed to the constructor.

    Returns
    -------
        Initialised instance
    """
    # The attributes are carried by `metadata` and `non_input4mips_metadata`,
    # so the data itself is stored on a copy with its attributes cleared.
    # `ds` is left untouched.
    ds_stripped = ds.copy()
    ds_stripped.attrs = {}

    metadata_fields = [
        f.name for f in fields(Input4MIPsDatasetMetadata) if f.name in ds.attrs
    ]
    metadata = Input4MIPsDatasetMetadata(
        **{k: ds.attrs[k] for k in metadata_fields}
    )
    non_input4mips_metadata = {
        k: v for k, v in ds.attrs.items() if k not in metadata_fields
    }

    # Use `cls` rather than a hard-coded class name so that subclasses
    # get an instance of their own type,
    # in line with the other alternate constructors of this class.
    if cvs is None:
        # Leave `cvs` out entirely rather than passing `None`.
        # NOTE(review): presumably the constructor then falls back
        # to its own default CVs - confirm against the class definition.
        return cls(
            data=ds_stripped,
            metadata=metadata,
            non_input4mips_metadata=non_input4mips_metadata,
        )

    return cls(
        data=ds_stripped,
        metadata=metadata,
        non_input4mips_metadata=non_input4mips_metadata,
        cvs=cvs,
    )

get_out_path_and_disk_ready_dataset(root_data_dir, pint_dequantify_format='cf', frequency_metadata_keys=FrequencyMetadataKeys(), time_dimension='time') #

Get path in which to write and a disk-ready dataset

Parameters:

Name Type Description Default
root_data_dir Path

Root directory in which to write the file

required
pint_dequantify_format str

Format to use when dequantifying variables with Pint.

It is unlikely that you will want to change this.

'cf'
frequency_metadata_keys FrequencyMetadataKeys

Metadata definitions for frequency information

FrequencyMetadataKeys()
time_dimension str

The time dimension of the data.

Required so that we know what information to pass to the path generating algorithm, in case the path generating algorithm requires time axis information.

'time'

Returns:

Type Description
tuple[Path, Dataset]

Path in which to write the file and the disk-ready xarray Dataset to write in the file.

Notes

You will generally not want to write the output of this directly to disk, because it will not be CF-compliant. To see how to write CF-compliant files, see write.

See Also

write

Source code in src/input4mips_validation/dataset/dataset.py
def get_out_path_and_disk_ready_dataset(
    self,
    root_data_dir: Path,
    pint_dequantify_format: str = "cf",
    frequency_metadata_keys: FrequencyMetadataKeys = FrequencyMetadataKeys(),
    time_dimension: str = "time",
) -> tuple[Path, xr.Dataset]:
    """
    Get path in which to write and a disk-ready dataset

    Parameters
    ----------
    root_data_dir
        Root directory in which to write the file

    pint_dequantify_format
        Format to use when dequantifying variables with Pint.

        It is unlikely that you will want to change this.
        If the data does not support pint dequantification
        (an `AttributeError` is raised), this is ignored.

    frequency_metadata_keys
        Metadata definitions for frequency information

    time_dimension
        The time dimension of the data.

        Required so that we know
        what information to pass to the path generating algorithm,
        in case the path generating algorithm requires time axis information.

    Returns
    -------
    :
        Path in which to write the file
        and the disk-ready `xarray.Dataset` to write in the file.

    Notes
    -----
    You will generally not want to write the output of this directly to disk,
    because it will not be CF-compliant.
    To see how to write CF-compliant files,
    see [`write`][input4mips_validation.dataset.Input4MIPsDataset.write].

    See Also
    --------
    [`write`][input4mips_validation.dataset.Input4MIPsDataset.write]
    """
    cvs = self.cvs

    # Can shallow copy as we don't alter the data from here on
    ds_disk = self.data.copy(deep=False)
    try:
        ds_disk = ds_disk.pint.dequantify(format=pint_dequantify_format)
    except AttributeError:
        logger.debug(
            "Not dequantifying with pint, "
            "I assume you know what you're doing with units"
        )

    # Add all the metadata.
    # Convert once and re-use the result in whichever branch applies.
    input4mips_attrs = convert_input4mips_metadata_to_ds_attrs(self.metadata)
    if self.non_input4mips_metadata is not None:
        # Merge the metadata.
        # Validation ensures that there will be no clash of keys.
        ds_disk.attrs = self.non_input4mips_metadata | input4mips_attrs

    else:
        ds_disk.attrs = input4mips_attrs

    # Must be unique for every written file,
    # so we deliberately don't provide a way
    # for the user to overwrite this at present
    # and we deliberately overwrite any existing values.
    ds_disk.attrs["tracking_id"] = generate_tracking_id()
    ds_disk.attrs["creation_date"] = generate_creation_timestamp()

    time_start, time_end = infer_time_start_time_end_for_filename(
        ds=ds_disk,
        frequency_metadata_key=frequency_metadata_keys.frequency_metadata_key,
        no_time_axis_frequency=frequency_metadata_keys.no_time_axis_frequency,
        time_dimension=time_dimension,
    )

    # The path is built from the final attributes,
    # i.e. including the tracking ID and creation date set above.
    out_path = cvs.DRS.get_file_path(
        root_data_dir=root_data_dir,
        available_attributes=ds_disk.attrs,
        time_start=time_start,
        time_end=time_end,
    )

    return out_path, ds_disk

write(root_data_dir, pint_dequantify_format='cf', unlimited_dimensions=('time',), frequency_metadata_keys=FrequencyMetadataKeys(), time_dimension='time', xr_variable_processor=XRVariableHelper(), bounds_info=None) #

Write to disk

This takes a very opinionated view of how to write to disk. If you need to alter this, please take the source code of this method as a template then alter as required.

Parameters:

Name Type Description Default
root_data_dir Path

Root directory in which to write the file

required
pint_dequantify_format str

Format to use when dequantifying variables with Pint.

It is unlikely that you will want to change this. If you are not using pint for unit handling, this will be ignored.

'cf'
unlimited_dimensions tuple[str, ...]

Dimensions which should be unlimited in the written file

This is passed to iris.save.

('time',)
frequency_metadata_keys FrequencyMetadataKeys

Metadata definitions for frequency information

FrequencyMetadataKeys()
time_dimension str

The time dimension of the data.

Required so that we know what information to pass to the path generating algorithm, in case the path generating algorithm requires time axis information.

'time'
xr_variable_processor XRVariableProcessorLike

Helper to use for processing the variables in xarray objects.

XRVariableHelper()
bounds_info BoundsInfo | None

Metadata definitions for bounds handling

If None, this will be inferred from ds.

None

Returns:

Type Description
Path

Path in which the file was written

Source code in src/input4mips_validation/dataset/dataset.py
def write(  # noqa: PLR0913
    self,
    root_data_dir: Path,
    pint_dequantify_format: str = "cf",
    unlimited_dimensions: tuple[str, ...] = ("time",),
    frequency_metadata_keys: FrequencyMetadataKeys = FrequencyMetadataKeys(),
    time_dimension: str = "time",
    xr_variable_processor: XRVariableProcessorLike = XRVariableHelper(),
    bounds_info: BoundsInfo | None = None,
) -> Path:
    """
    Validate the dataset, then write it to disk

    The way the file is written is deliberately opinionated.
    If you need something different,
    use the source code of this method as a template
    and adapt it to your needs.

    Parameters
    ----------
    root_data_dir
        Root directory in which to write the file

    pint_dequantify_format
        Format to use when dequantifying variables with Pint.

        You are unlikely to need to change this.
        It is ignored if you are not using pint for unit handling.

    unlimited_dimensions
        Dimensions which should be unlimited in the written file

        This is passed to [iris.save][].

    frequency_metadata_keys
        Metadata definitions for frequency information

    time_dimension
        The time dimension of the data.

        The path generating algorithm may need time axis information,
        so we have to know which dimension to read it from.

    xr_variable_processor
        Helper to use for processing the variables in xarray objects.

    bounds_info
        Metadata definitions for bounds handling

        If `None`, this will be inferred from `ds`.

    Returns
    -------
    :
        Path in which the file was written
    """
    target_path, dataset_to_write = self.get_out_path_and_disk_ready_dataset(
        root_data_dir=root_data_dir,
        pint_dequantify_format=pint_dequantify_format,
        frequency_metadata_keys=frequency_metadata_keys,
        time_dimension=time_dimension,
    )

    # Final validation before anything touches the disk.
    # As part of https://github.com/climate-resource/input4mips_validation/issues/14
    # this is where extra checks belong
    # (tracking ID, creation date, comparison with DRS from cvs etc.).
    get_ds_to_write_to_disk_validation_result(
        ds=dataset_to_write,
        out_path=target_path,
        cvs=self.cvs,
        xr_variable_processor=xr_variable_processor,
        frequency_metadata_keys=frequency_metadata_keys,
        bounds_info=bounds_info,
    ).raise_if_errors()

    # ncdata handles the xarray -> iris cubes conversion
    iris_cubes = ncdata.iris_xarray.cubes_from_xarray(dataset_to_write)

    # Only create the target directory
    # once validation and conversion have both succeeded.
    target_path.parent.mkdir(parents=True, exist_ok=True)

    iris.save(iris_cubes, target_path, unlimited_dimensions=unlimited_dimensions)

    return target_path

Input4MIPsDatasetMetadata #

Metadata for an input4MIPs dataset

Source code in src/input4mips_validation/dataset/metadata.py
@frozen
class Input4MIPsDatasetMetadata:
    """
    Metadata that describes an input4MIPs dataset

    Fields without a default value must always be supplied.
    Fields that default to `None` are optional.
    """

    # Required metadata

    activity_id: str
    """ID of the activity that applies to the file"""

    contact: str
    """Email addresses to use for questions about the file"""

    dataset_category: str
    """Category of the file"""

    frequency: str
    """Frequency of the data held in the file"""

    further_info_url: str
    """URL at which further information about the file or its data can be found"""

    grid_label: str = field()
    """
    Label that identifies the file's grid

    [TODO: cross-ref to the CVs]
    """

    institution_id: str
    """ID of the institute that created the file"""

    license: str
    """License information that applies to the dataset"""

    mip_era: str
    """The MIP era to which this file belongs"""

    nominal_resolution: str
    """Nominal resolution of the data held in the file"""

    realm: str
    """Realm of the data held in the file"""

    source_id: str
    """ID of the file's source"""

    source_version: str
    """Version of the file, as defined by the source"""

    target_mip: str
    """MIP which this file targets"""

    variable_id: str
    """ID of the variable contained in the file"""

    # Optional metadata

    comment: Union[str, None] = None
    """
    Comments that apply to the file

    These comments are written into the file itself,
    so they can only describe the file as it was at the time of writing.
    Comments made about the file afterwards,
    e.g. reasons for deprecation,
    belong in `comment_post_publication`.
    """

    doi: Union[str, None] = None
    """Digital object identifier (DOI) associated with the file."""

    institution: Union[str, None] = None
    """Long-form description of the institute identified by `institution_id`"""

    license_id: Union[str, None] = None
    """ID of the license that applies to this dataset"""

    product: Union[str, None] = None
    """Kind of data held in the file"""

    region: Union[str, None] = None
    """Region of the data held in the file"""

    source: Union[str, None] = None
    """Long-form description of the source identified by `source_id`"""

activity_id: str instance-attribute #

Activity ID that applies to the file

comment: Union[str, None] = None class-attribute instance-attribute #

Comments that apply to the file

These are the comments included in the file itself. As a result, they can only apply to the file at the time of writing. For comments made about the file after the fact, e.g. reasons for deprecation, see comment_post_publication.

contact: str instance-attribute #

Email addresses to contact in case of questions about the file

dataset_category: str instance-attribute #

The file's category

doi: Union[str, None] = None class-attribute instance-attribute #

The digital object identifier (DOI) associated with the file.

frequency: str instance-attribute #

Frequency of the data in the file

further_info_url: str instance-attribute #

URL where further information about the file/data in the file can be found

grid_label: str = field() class-attribute instance-attribute #

Label that identifies the file's grid

[TODO: cross-ref to the CVs]

institution: Union[str, None] = None class-attribute instance-attribute #

Long-form description of the institute referred to by institution_id

institution_id: str instance-attribute #

ID of the institute that created the file

license: str instance-attribute #

License information for the dataset

license_id: Union[str, None] = None class-attribute instance-attribute #

ID of the license that applies to this dataset

mip_era: str instance-attribute #

The MIP era to which this file belongs

nominal_resolution: str instance-attribute #

Nominal resolution of the data in the file

product: Union[str, None] = None class-attribute instance-attribute #

The kind of data in the file

realm: str instance-attribute #

The realm of the data in the file

region: Union[str, None] = None class-attribute instance-attribute #

The region of the data in the file

source: Union[str, None] = None class-attribute instance-attribute #

Long-form description of the source referred to by source_id

source_id: str instance-attribute #

The ID of the file's source

source_version: str instance-attribute #

The version of the file, as defined by the source

target_mip: str instance-attribute #

The MIP that this file targets

variable_id: str instance-attribute #

The ID of the variable contained in the file

Input4MIPsDatasetMetadataDataProducerMinimum #

Minimum metadata required from an input4MIPs dataset producer

This is the minimum metadata required to create a valid Input4MIPsDataset object using from_data_producer_minimum_information.

Source code in src/input4mips_validation/dataset/metadata_data_producer_minimum.py
@frozen
class Input4MIPsDatasetMetadataDataProducerMinimum:
    """
    Smallest set of metadata an input4MIPs dataset producer has to provide

    With this metadata, a valid
    [`Input4MIPsDataset`][input4mips_validation.dataset.Input4MIPsDataset]
    object can be created using
    [`from_data_producer_minimum_information`][input4mips_validation.dataset.Input4MIPsDataset.from_data_producer_minimum_information].
    """

    grid_label: str = field()
    """
    Label that identifies the file's grid

    [TODO: cross-ref to the CVs]
    """

    nominal_resolution: str
    """Nominal resolution of the data held in the file"""

    source_id: str
    """ID of the file's source"""

    target_mip: str
    """MIP which this file targets"""

grid_label: str = field() class-attribute instance-attribute #

Label that identifies the file's grid

[TODO: cross-ref to the CVs]

nominal_resolution: str instance-attribute #

Nominal resolution of the data in the file

source_id: str instance-attribute #

The ID of the file's source

target_mip: str instance-attribute #

The MIP that this file targets

Input4MIPsDatasetMetadataDataProducerMultipleVariableMinimum #

Minimum metadata required from input4MIPs dataset producer for a multi-variable file

This is the minimum metadata required to create a valid Input4MIPsDataset object using from_data_producer_minimum_information_multiple_variable.

Source code in src/input4mips_validation/dataset/metadata_data_producer_multiple_variable_minimum.py
@frozen
class Input4MIPsDatasetMetadataDataProducerMultipleVariableMinimum:
    """
    Smallest set of metadata a producer has to provide for a multi-variable file

    With this metadata, a valid
    [`Input4MIPsDataset`][input4mips_validation.dataset.Input4MIPsDataset]
    object can be created using
    [`from_data_producer_minimum_information_multiple_variable`][input4mips_validation.dataset.dataset.Input4MIPsDataset.from_data_producer_minimum_information_multiple_variable].
    """

    grid_label: str = field()
    """
    Label that identifies the file's grid

    [TODO: cross-ref to the CVs]
    """

    nominal_resolution: str
    """Nominal resolution of the data held in the file"""

    source_id: str
    """ID of the file's source"""

    target_mip: str
    """MIP which this file targets"""

    dataset_category: str
    """Category of the file"""

    realm: str
    """Realm of the data held in the file"""

dataset_category: str instance-attribute #

The file's category

grid_label: str = field() class-attribute instance-attribute #

Label that identifies the file's grid

[TODO: cross-ref to the CVs]

nominal_resolution: str instance-attribute #

Nominal resolution of the data in the file

realm: str instance-attribute #

The realm of the data in the file

source_id: str instance-attribute #

The ID of the file's source

target_mip: str instance-attribute #

The MIP that this file targets