input4mips_validation.inference.from_data#
input4mips_validation.inference.from_data
#
Inference of metadata from data
VARIABLE_DATASET_CATEGORY_MAP = {'tos': 'SSTsAndSeaIce', 'siconc': 'SSTsAndSeaIce', 'sftof': 'SSTsAndSeaIce', 'mole_fraction_of_carbon_dioxide_in_air': 'GHGConcentrations', 'mole_fraction_of_methane_in_air': 'GHGConcentrations', 'mole_fraction_of_nitrous_oxide_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc116_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc218_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc3110_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc4112_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc5114_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc6116_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc7118_in_air': 'GHGConcentrations', 'mole_fraction_of_pfc318_in_air': 'GHGConcentrations', 'mole_fraction_of_carbon_tetrachloride_in_air': 'GHGConcentrations', 'mole_fraction_of_carbon_tetrafluoride_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc11_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc113_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc114_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc115_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc12_in_air': 'GHGConcentrations', 'mole_fraction_of_dichloromethane_in_air': 'GHGConcentrations', 'mole_fraction_of_methyl_bromide_in_air': 'GHGConcentrations', 'mole_fraction_of_hcc140a_in_air': 'GHGConcentrations', 'mole_fraction_of_methyl_chloride_in_air': 'GHGConcentrations', 'mole_fraction_of_chloroform_in_air': 'GHGConcentrations', 'mole_fraction_of_halon1211_in_air': 'GHGConcentrations', 'mole_fraction_of_halon1301_in_air': 'GHGConcentrations', 'mole_fraction_of_halon2402_in_air': 'GHGConcentrations', 'mole_fraction_of_hcfc141b_in_air': 'GHGConcentrations', 'mole_fraction_of_hcfc142b_in_air': 'GHGConcentrations', 'mole_fraction_of_hcfc22_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc125_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc134a_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc143a_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc152a_in_air': 
'GHGConcentrations', 'mole_fraction_of_hfc227ea_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc23_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc236fa_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc245fa_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc32_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc365mfc_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc4310mee_in_air': 'GHGConcentrations', 'mole_fraction_of_nitrogen_trifluoride_in_air': 'GHGConcentrations', 'mole_fraction_of_sulfur_hexafluoride_in_air': 'GHGConcentrations', 'mole_fraction_of_sulfuryl_fluoride_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc11_eq_in_air': 'GHGConcentrations', 'mole_fraction_of_cfc12_eq_in_air': 'GHGConcentrations', 'mole_fraction_of_hfc134a_eq_in_air': 'GHGConcentrations', 'solar_irradiance_per_unit_wavelength': 'solar', 'solar_irradiance': 'solar'}
module-attribute
#
Mapping from variable names to dataset category
The variable names are generally CF standard names (i.e. can include underscores) rather than CMIP data request names (which are meant to have no underscores or other special characters).
TODO: move this into CVs rather than hard-coding here
VARIABLE_REALM_MAP = {'tos': 'ocean', 'siconc': 'seaIce', 'sftof': 'ocean', 'areacello': 'ocean', 'mole_fraction_of_carbon_dioxide_in_air': 'atmos', 'mole_fraction_of_methane_in_air': 'atmos', 'mole_fraction_of_nitrous_oxide_in_air': 'atmos', 'mole_fraction_of_pfc116_in_air': 'atmos', 'mole_fraction_of_pfc218_in_air': 'atmos', 'mole_fraction_of_pfc3110_in_air': 'atmos', 'mole_fraction_of_pfc4112_in_air': 'atmos', 'mole_fraction_of_pfc5114_in_air': 'atmos', 'mole_fraction_of_pfc6116_in_air': 'atmos', 'mole_fraction_of_pfc7118_in_air': 'atmos', 'mole_fraction_of_pfc318_in_air': 'atmos', 'mole_fraction_of_carbon_tetrachloride_in_air': 'atmos', 'mole_fraction_of_carbon_tetrafluoride_in_air': 'atmos', 'mole_fraction_of_cfc11_in_air': 'atmos', 'mole_fraction_of_cfc113_in_air': 'atmos', 'mole_fraction_of_cfc114_in_air': 'atmos', 'mole_fraction_of_cfc115_in_air': 'atmos', 'mole_fraction_of_cfc12_in_air': 'atmos', 'mole_fraction_of_dichloromethane_in_air': 'atmos', 'mole_fraction_of_methyl_bromide_in_air': 'atmos', 'mole_fraction_of_hcc140a_in_air': 'atmos', 'mole_fraction_of_methyl_chloride_in_air': 'atmos', 'mole_fraction_of_chloroform_in_air': 'atmos', 'mole_fraction_of_halon1211_in_air': 'atmos', 'mole_fraction_of_halon1301_in_air': 'atmos', 'mole_fraction_of_halon2402_in_air': 'atmos', 'mole_fraction_of_hcfc141b_in_air': 'atmos', 'mole_fraction_of_hcfc142b_in_air': 'atmos', 'mole_fraction_of_hcfc22_in_air': 'atmos', 'mole_fraction_of_hfc125_in_air': 'atmos', 'mole_fraction_of_hfc134a_in_air': 'atmos', 'mole_fraction_of_hfc143a_in_air': 'atmos', 'mole_fraction_of_hfc152a_in_air': 'atmos', 'mole_fraction_of_hfc227ea_in_air': 'atmos', 'mole_fraction_of_hfc23_in_air': 'atmos', 'mole_fraction_of_hfc236fa_in_air': 'atmos', 'mole_fraction_of_hfc245fa_in_air': 'atmos', 'mole_fraction_of_hfc32_in_air': 'atmos', 'mole_fraction_of_hfc365mfc_in_air': 'atmos', 'mole_fraction_of_hfc4310mee_in_air': 'atmos', 'mole_fraction_of_nitrogen_trifluoride_in_air': 'atmos', 
'mole_fraction_of_sulfur_hexafluoride_in_air': 'atmos', 'mole_fraction_of_sulfuryl_fluoride_in_air': 'atmos', 'mole_fraction_of_cfc11_eq_in_air': 'atmos', 'mole_fraction_of_cfc12_eq_in_air': 'atmos', 'mole_fraction_of_hfc134a_eq_in_air': 'atmos', 'solar_irradiance_per_unit_wavelength': 'atmos', 'solar_irradiance': 'atmos', 'areacella': 'atmos'}
module-attribute
#
Mapping from variable names to realm
The variable names are generally CF standard names (i.e. can include underscores) rather than CMIP data request names (which are meant to have no underscores or other special characters).
TODO: move this into CVs rather than hard-coding here
BoundsInfo
#
Definition of the values used for bounds handling
We put this together for ease of explanation and conciseness.
Source code in src/input4mips_validation/inference/from_data.py
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 | |
bounds_dim: str = 'bounds'
class-attribute
instance-attribute
#
The name of the bounds dimension in the data
bounds_dim_lower_val: int = 0
class-attribute
instance-attribute
#
Value of the lower bounds dimension, which allows us to select the lower bounds.
bounds_dim_upper_val: int = 1
class-attribute
instance-attribute
#
Value of the upper bounds dimension, which allows us to select the upper bounds.
time_bounds: str = 'time_bounds'
class-attribute
instance-attribute
#
Name of the variable which represents the bounds of the time axis
from_ds(ds, time_dimension='time')
classmethod
#
Initialise from a dataset
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
ds
|
Dataset
|
Dataset from which to initialise |
required |
time_dimension
|
str
|
The name of the time dimension in the dataset |
'time'
|
Returns:
| Type | Description |
|---|---|
BoundsInfo
|
Initialised class |
Source code in src/input4mips_validation/inference/from_data.py
154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 | |
FrequencyMetadataKeys
#
Definition of the keys used for frequency metadata
We put this together for ease of explanation and conciseness.
Source code in src/input4mips_validation/inference/from_data.py
frequency_metadata_key: str = 'frequency'
class-attribute
instance-attribute
#
The key in the data's metadata which points to information about the data's frequency
no_time_axis_frequency: str = 'fx'
class-attribute
instance-attribute
#
The value of frequency_metadata_key in the metadata which indicates
that the file has no time axis i.e. is fixed in time.
create_time_range_for_filename(time_start, time_end, ds_frequency, start_end_separator='-')
#
Create the time range information for the filename
It is safest to use this function with the output from
infer_time_start_time_end_for_filename
because that function correctly infers the start and end time from the data,
even when the data represents a climatology.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
time_start
|
cftime.datetime | datetime.datetime | np.datetime64
|
The start time (of the underlying dataset) |
required |
time_end
|
cftime.datetime | datetime.datetime | np.datetime64
|
The end time (of the underlying dataset) |
required |
ds_frequency
|
str
|
The frequency of the underlying dataset |
required |
start_end_separator
|
str
|
The string(s) to use to separate the start and end time. |
'-'
|
Returns:
| Type | Description |
|---|---|
str
|
The time-range information, formatted correctly given the underlying dataset's frequency. |
Source code in src/input4mips_validation/inference/from_data.py
ds_is_climatology(ds, time_dimension)
#
Determine whether a dataset represents a climatology or not
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
ds
|
Dataset
|
Dataset to check |
required |
time_dimension
|
str
|
The name of the time dimension in `ds` |
required |
Returns:
| Type | Description |
|---|---|
bool
|
Whether the dataset is a climatology or not |
Source code in src/input4mips_validation/inference/from_data.py
frequency_is_climatology(frequency)
#
Check whether the frequency information indicates that the data is a climatology
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
frequency
|
str
|
Frequency attribute value |
required |
Returns:
| Type | Description |
|---|---|
bool
|
Whether the data represents a climatology or not |
Source code in src/input4mips_validation/inference/from_data.py
get_climatology_bounds(ds, time_dimension='time')
#
Get the climatology bounds variable
This should only be used after having first checked that `ds` is a climatology (using e.g. `ds_is_climatology`).
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
ds
|
Dataset
|
Dataset |
required |
time_dimension
|
str
|
Time dimension in `ds` |
'time'
|
Returns:
| Type | Description |
|---|---|
DataArray
|
Climatology bounds variable |
Source code in src/input4mips_validation/inference/from_data.py
get_frequency_label_stem(ds, climatology, time_dimension, time_bounds, bounds_dim, bounds_dim_lower_val, bounds_dim_upper_val)
#
Get the frequency label's stem from data
This is mainly intended for internal use; see `infer_frequency` instead.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
ds
|
Dataset
|
Dataset |
required |
climatology
|
bool
|
Does this dataset represent a climatology? |
required |
time_dimension
|
str
|
Name of the time dimension in `ds` |
required |
time_bounds
|
str
|
Variable assumed to contain time bounds information |
required |
bounds_dim
|
str
|
The name of the bounds dimension |
required |
bounds_dim_lower_val
|
int
|
Value of the lower bounds dimension, which allows us to select the lower bounds. |
required |
bounds_dim_upper_val
|
int
|
Value of the upper bounds dimension, which allows us to select the upper bounds. |
required |
Returns:
| Type | Description |
|---|---|
str
|
Inferred frequency stem e.g. "mon", "yr". Climatology information is added in `infer_frequency`.
|
Source code in src/input4mips_validation/inference/from_data.py
416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 | |
infer_frequency(ds, no_time_axis_frequency, time_dimension='time', time_bounds='time_bounds', bounds_dim='bounds', bounds_dim_lower_val=0, bounds_dim_upper_val=1)
#
Infer frequency from data
TODO: work out if/where these rules are captured anywhere else. These resources are helpful, but I'm not sure if they spell out the rules exactly:
- https://github.com/WCRP-CMIP/CMIP6_CVs/blob/main/CMIP6_frequency.json
- https://wcrp-cmip.github.io/WGCM_Infrastructure_Panel/Papers/CMIP6_global_attributes_filenames_CVs_v6.2.7.pdf
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
ds
|
Dataset
|
Dataset |
required |
no_time_axis_frequency
|
str
|
Value to return if the data has no time axis i.e. is a fixed field. |
required |
time_dimension
|
str
|
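Name of the expected time dimension in `ds`. If `time_dimension` is not in `ds`, the data is assumed to have no time axis (i.e. to be a fixed field). |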
'time'
|
time_bounds
|
str
|
Variable assumed to contain time bounds information |
'time_bounds'
|
bounds_dim
|
str
|
The name of the bounds dimension |
'bounds'
|
bounds_dim_lower_val
|
int
|
Value of the lower bounds dimension, which allows us to select the lower bounds. |
0
|
bounds_dim_upper_val
|
int
|
Value of the upper bounds dimension, which allows us to select the upper bounds. |
1
|
Returns:
| Type | Description |
|---|---|
str
|
Inferred frequency |
Source code in src/input4mips_validation/inference/from_data.py
233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 | |
infer_time_start_time_end_for_filename(ds, frequency_metadata_key, no_time_axis_frequency, time_dimension)
#
Infer start and end time of the data in a dataset for creating file names
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
ds
|
Dataset
|
Dataset from which to infer start and end time |
required |
frequency_metadata_key
|
str
|
The key in the data's metadata which points to information about the data's frequency |
required |
no_time_axis_frequency
|
str
|
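The value of `frequency_metadata_key` in the metadata which indicates that the file has no time axis i.e. is fixed in time. |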
required |
time_dimension
|
str
|
The time dimension of the data |
required |
Returns:
| Name | Type | Description |
|---|---|---|
time_start |
Union[cftime.datetime, datetime.datetime, np.datetime64, None]
|
Start time of the data |
time_end |
Union[cftime.datetime, datetime.datetime, np.datetime64, None]
|
End time of the data |
Source code in src/input4mips_validation/inference/from_data.py
is_daily_steps(step_start, step_end)
#
Determine whether the steps are daily
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
step_start
|
DataArray
|
Start of each step (e.g. start of each bound) |
required |
step_end
|
DataArray
|
End of each step (e.g. end of each bound) |
required |
Returns:
| Type | Description |
|---|---|
bool
|
`True` if the steps are daily, otherwise `False` |
Source code in src/input4mips_validation/inference/from_data.py
is_monthly_steps(step_start, step_end)
#
Determine whether the steps are monthly
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
step_start
|
DataArray
|
Start of each step (e.g. start of each bound) |
required |
step_end
|
DataArray
|
End of each step (e.g. end of each bound) |
required |
Returns:
| Type | Description |
|---|---|
bool
|
`True` if the steps are monthly, otherwise `False` |
Source code in src/input4mips_validation/inference/from_data.py
is_yearly_steps(step_start, step_end)
#
Determine whether the steps are yearly
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
step_start
|
DataArray
|
Start of each step (e.g. start of each bound) |
required |
step_end
|
DataArray
|
End of each step (e.g. end of each bound) |
required |
Returns:
| Type | Description |
|---|---|
bool
|
`True` if the steps are yearly, otherwise `False` |