dataset_utils
Library for utility functions regarding an xarray dataset
add_variable(dataset, variable, variable_name)
Add variable to dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
_xr.Dataset |
Dataset to add to |
required |
variable |
_xr.DataArray |
Variable containing new data |
required |
variable_name |
str |
Name of new variable |
required |
Exceptions:
Type | Description |
---|---|
ValueError |
When variable can not be added |
Returns:
Type | Description |
---|---|
_xr.Dataset |
original dataset |
Source code in utils/dataset_utils.py
def add_variable(
dataset: _xr.Dataset, variable: _xr.DataArray, variable_name: str
) -> _xr.Dataset:
"""Add variable to dataset.
Args:
dataset (_xr.Dataset): Dataset to add to
variable (_xr.DataArray): Variable containing new data
variable_name (str): Name of new variable
Raises:
ValueError: When variable can not be added
Returns:
_xr.Dataset: original dataset
"""
if not isinstance(variable, _xr.DataArray):
raise ValueError("ERROR: Cannot add variable to dataset")
dataset[variable_name] = (variable.dims, variable.data)
try:
dataset[variable_name] = (variable.dims, variable.data)
except ValueError as exc:
raise ValueError("ERROR: Cannot add variable to dataset") from exc
return dataset
copy_dataset(dataset)
Copy dataset to new dataset
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
_xr.Dataset |
Dataset to remove variable from |
required |
variable |
str |
Variable to remove |
required |
Exceptions:
Type | Description |
---|---|
ValueError |
When variable can not be removed |
Returns:
Type | Description |
---|---|
_xr.Dataset |
Original dataset |
Source code in utils/dataset_utils.py
def copy_dataset(dataset: _xr.Dataset) -> _xr.Dataset:
"""Copy dataset to new dataset
Args:
dataset (_xr.Dataset): Dataset to remove variable from
variable (str): Variable to remove
Raises:
ValueError: When variable can not be removed
Returns:
_xr.Dataset: Original dataset
"""
try:
output_dataset = dataset.copy(deep=False)
except ValueError as exc:
raise ValueError("ERROR: Cannot copy dataset.") from exc
return output_dataset
create_composed_dataset(input_datasets, variables_to_use, mapping)
Creates a dataset based on the provided input datasets and the selected variables.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
input_datasets |
List[_xr.Dataset] |
inputs to copy the data from |
required |
variables_to_use |
List[str] |
selected variables to copy |
required |
mapping |
dict[str, str] |
mapping for variables to rename after copying |
required |
Returns:
Type | Description |
---|---|
_xr.Dataset |
composed dataset (with selected variables) |
Source code in utils/dataset_utils.py
def create_composed_dataset(
input_datasets: List[_xr.Dataset],
variables_to_use: List[str],
mapping: Optional[dict[str, str]],
) -> _xr.Dataset:
"""Creates a dataset based on the provided input datasets and
the selected variables.
Args:
input_datasets (List[_xr.Dataset]): inputs to copy the data from
variables_to_use (List[str]): selected variables to copy
mapping (dict[str, str]): mapping for variables to rename after copying
Returns:
_xr.Dataset: composed dataset (with selected variables)
"""
merged_dataset = merge_list_of_datasets(input_datasets)
cleaned_dataset = remove_all_variables_except(merged_dataset, variables_to_use)
if mapping is None or len(mapping) == 0:
return cleaned_dataset
return cleaned_dataset.rename_vars(mapping)
get_dependent_var_list(dataset, dummy_vars)
Obtain the list of variables in a dataset. The variables are recursively looked up based on the dummy variable. This is done to support XUgrid and to prevent invalid topologies. This also allows QuickPlot to visualize the results.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
_xr.Dataset |
Dataset to search for dummy variable |
required |
dummy_vars |
List[str] |
dummy variables |
required |
Returns:
Type | Description |
---|---|
List[str] |
dependent variables |
Source code in utils/dataset_utils.py
def get_dependent_var_list(dataset: _xr.Dataset, dummy_vars) -> List:
"""Obtain the list of variables in a dataset.
The variables are
recursively looked up based on the dummy variable.
This is done to support XUgrid and to prevent invalid topologies.
This also allows QuickPlot to visualize the results.
Args:
dataset (_xr.Dataset): Dataset to search for dummy variable
dummy_vars (List[str]): dummy variables
Returns:
List[str]: dependent variables
"""
var_list = rec_search_dep_vars(dataset, dummy_vars, [], [])
var_list += dummy_vars
return _lu.remove_duplicates_from_list(var_list)
get_dependent_vars_by_var_name(dataset, var_name)
Get all the variables that are described in the attributes of the dummy variable, associated with the UGrid standard.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
_xr.Dataset |
Dataset to get dependent variables from |
required |
var_name |
str |
the name of the dummy variable |
required |
Returns:
Type | Description |
---|---|
list[str] |
list of the dependent variables to copy |
Source code in utils/dataset_utils.py
def get_dependent_vars_by_var_name(dataset: _xr.Dataset, var_name: str) -> List[str]:
"""Get all the variables that are described in the attributes of the dummy variable,
associated with the UGrid standard.
Args:
dataset (_xr.Dataset): Dataset to get dependent variables from
var_name (str): the name of the dummy variable
Returns:
list[str]: list of the dependent variables to copy
"""
vars_to_check = ["_coordinates", "_connectivity", "bounds"]
attrs_list = []
attrs = dataset[var_name].attrs
for attr in attrs.items():
if any(attr[0].endswith(var_check) for var_check in vars_to_check):
attrs_list = list(set(attrs_list + attr[1].split(" ")))
return attrs_list
get_dummy_variable_in_ugrid(dataset)
Get the name of the variable that serves as the dummy variable in the UGrid.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
_xr.Dataset |
Dataset to search for dummy variable |
required |
Returns:
Type | Description |
---|---|
str |
name of the dummy variable |
Source code in utils/dataset_utils.py
def get_dummy_variable_in_ugrid(dataset: _xr.Dataset) -> list:
"""Get the name of the variable that serves as the dummy variable in the UGrid.
Args:
dataset (_xr.Dataset): Dataset to search for dummy variable
Returns:
str: name of the dummy variable
"""
dummy = [
name
for name in dataset.data_vars
if ("cf_role", "mesh_topology") in dataset[name].attrs.items()
]
if len(dummy) == 0:
raise ValueError(
"No dummy variable defined and therefore input dataset does "
"not comply with UGrid convention."
)
return dummy
list_coords(dataset)
List coordinates in dataset
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
_xr.Dataset |
Dataset to list variables from |
required |
Returns:
Type | Description |
---|---|
list[str] |
list_variables |
Source code in utils/dataset_utils.py
def list_coords(dataset: _xr.Dataset) -> list[str]:
"""List coordinates in dataset
Args:
dataset (_xr.Dataset): Dataset to list variables from
Returns:
list_variables
"""
return list((dataset.coords or {}).keys())
list_vars(dataset)
List variables in dataset
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
_xr.Dataset |
Dataset to list variables from |
required |
Returns:
Type | Description |
---|---|
list[str] |
list_variables |
Source code in utils/dataset_utils.py
def list_vars(dataset: _xr.Dataset) -> list[str]:
"""List variables in dataset
Args:
dataset (_xr.Dataset): Dataset to list variables from
Returns:
list_variables
"""
return list((dataset.data_vars or {}).keys())
merge_datasets(dataset1, dataset2)
Merge two datasets into one dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset1 |
_xr.Dataset |
Dataset 1 to merge |
required |
dataset2 |
_xr.Dataset |
Dataset 2 to merge |
required |
Exceptions:
Type | Description |
---|---|
ValueError |
When datasets cannot be merged |
Returns:
Type | Description |
---|---|
_xr.Dataset |
Original dataset |
Source code in utils/dataset_utils.py
def merge_datasets(dataset1: _xr.Dataset, dataset2: _xr.Dataset) -> _xr.Dataset:
"""Merge two datasets into one dataset.
Args:
dataset1 (_xr.Dataset): Dataset 1 to merge
dataset2 (_xr.Dataset): Dataset 2 to merge
Raises:
ValueError: When datasets cannot be merged
Returns:
_xr.Dataset: Original dataset
"""
try:
output_dataset = dataset1.merge(dataset2, compat="identical")
except ValueError as exc:
raise ValueError(f"ERROR: Cannot merge {dataset1} and {dataset2}.") from exc
return output_dataset
merge_list_of_datasets(list_datasets)
Merge list of datasets into 1 dataset
Parameters:
Name | Type | Description | Default |
---|---|---|---|
list_datasets |
list |
list of datasets to merge |
required |
Exceptions:
Type | Description |
---|---|
ValueError |
When datasets cannot be merged |
Returns:
Type | Description |
---|---|
_xr.Dataset |
Original dataset |
Source code in utils/dataset_utils.py
def merge_list_of_datasets(list_datasets: list[_xr.Dataset]) -> _xr.Dataset:
"""Merge list of datasets into 1 dataset
Args:
list_datasets (list): list of datasets to merge
Raises:
ValueError: When datasets cannot be merged
Returns:
_xr.Dataset: Original dataset
"""
try:
output_dataset = _xr.merge(list_datasets, compat="identical")
except ValueError as exc:
raise ValueError(f"ERROR: Cannot merge {list_datasets}.") from exc
return output_dataset
rec_search_dep_vars(dataset, var_list, dep_vars, checked_vars)
Recursive function to loop over all variables defined in the attribute of the dummy variable to find which are dependent and also the variables that are then again dependent on those variables etc.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
_xr.Dataset |
the dataset to check |
required |
var_list |
List[str] |
a list of dummy variable names to start the check |
required |
dep_vars |
List[str] |
a list of dependent variables found |
required |
checked_vars |
List[str] |
a list of variables that have already been checked in this function (it's a check so the function does not endlessly keep searching in the variables) |
required |
Returns:
Type | Description |
---|---|
list[str] |
list of names of dependent variables |
Source code in utils/dataset_utils.py
def rec_search_dep_vars(
dataset: _xr.Dataset,
var_list: List[str],
dep_vars: List[str],
checked_vars: List[str],
) -> list[str]:
"""Recursive function to loop over all variables defined in the
attribute of the dummy variable to find which are dependent and
also the variables that are then again dependent on those variables etc.
Args:
dataset (_xr.Dataset): the dataset to check
var_list (List[str]): a list of dummy variable names to start the check
dep_vars (List[str]): a list of dependent variables found
checked_vars (List[str]): a list of variables that have already been
checked in this function (it's a check so the function does not endlessly
keep searching in the variables)
Returns:
list[str]: list of names of dependent variables
"""
for var_name in var_list:
if var_name not in checked_vars:
dep_var = get_dependent_vars_by_var_name(dataset, var_name)
checked_vars.append(var_name)
if len(dep_var) > 0:
dep_vars = list(set(dep_var + dep_vars))
dep_vars = list(
set(
dep_vars
+ rec_search_dep_vars(dataset, dep_var, dep_vars, checked_vars)
)
)
return dep_vars
reduce_dataset_for_writing(dataset, save_only_variables, logger)
Reduce dataset before writing by only saving selected variables
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
DataSet |
dataset |
required |
save_only_variables |
List[str] |
optional list of variables to be saved. If |
required |
Exceptions:
Type | Description |
---|---|
OSError |
If save_only_variables do not exist in dataset |
Returns:
Type | Description |
---|---|
dataset |
Source code in utils/dataset_utils.py
def reduce_dataset_for_writing(
dataset: _xr.Dataset, save_only_variables: List[str], logger: ILogger
):
"""Reduce dataset before writing by only saving selected variables
Args:
dataset (DataSet): dataset
save_only_variables (List[str]): optional list of variables to be saved. If
empty, all variables are saved
Raises:
OSError: If save_only_variables do not exist in dataset
Returns:
dataset
"""
for var in save_only_variables:
if var not in dataset:
msg = f"ERROR: variable {var} is not present in dataset"
logger.log_error(msg)
raise OSError(msg)
dataset = remove_all_variables_except(dataset, save_only_variables)
return dataset
remove_all_variables_except(dataset, variables_to_keep)
Remove all variables from dataset except provided list of variables.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
_xr.Dataset |
Dataset to remove variables from |
required |
variables_to_keep |
List[str] |
selected variables to keep |
required |
Returns:
Type | Description |
---|---|
_xr.Dataset |
reduced dataset (containing selected variables) |
Source code in utils/dataset_utils.py
def remove_all_variables_except(
dataset: _xr.Dataset, variables_to_keep: List[str]
) -> _xr.Dataset:
"""Remove all variables from dataset except provided list of variables.
Args:
dataset (_xr.Dataset): Dataset to remove variables from
variables_to_keep (List[str]): selected variables to keep
Returns:
_xr.Dataset: reduced dataset (containing selected variables)
"""
dummy_var = get_dummy_variable_in_ugrid(dataset)
dependent_var_list = get_dependent_var_list(dataset, dummy_var)
variables_to_keep += dummy_var + dependent_var_list
all_variables = list_vars(dataset)
variables_to_remove = [
item for item in all_variables if item not in list(variables_to_keep)
]
cleaned_dataset = remove_variables(dataset, variables_to_remove)
return cleaned_dataset
remove_variables(dataset, variables)
Remove variable from dataset
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
_xr.Dataset |
Dataset to remove variable from |
required |
variables |
str/list |
Variable(s) to remove |
required |
Exceptions:
Type | Description |
---|---|
ValueError |
When variable can not be removed |
Returns:
Type | Description |
---|---|
_xr.Dataset |
Original dataset |
Source code in utils/dataset_utils.py
def remove_variables(dataset: _xr.Dataset, variables: list[str]) -> _xr.Dataset:
"""Remove variable from dataset
Args:
dataset (_xr.Dataset): Dataset to remove variable from
variables (str/list): Variable(s) to remove
Raises:
ValueError: When variable can not be removed
Returns:
_xr.Dataset: Original dataset
"""
try:
dataset = dataset.drop_vars(variables)
except ValueError as exc:
raise ValueError(f"ERROR: Cannot remove {variables} from dataset.") from exc
return dataset
rename_variable(dataset, variable_old, variable_new)
Rename variable in dataset
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
_xr.Dataset |
Dataset to remove variable from |
required |
variable_old |
str |
Variable to rename, old name |
required |
variable_new |
str |
Variable to rename, new name |
required |
Exceptions:
Type | Description |
---|---|
ValueError |
When variable can not be renamed |
Returns:
Type | Description |
---|---|
_xr.Dataset |
Original dataset |
Source code in utils/dataset_utils.py
def rename_variable(
dataset: _xr.Dataset, variable_old: str, variable_new: str
) -> _xr.Dataset:
"""Rename variable in dataset
Args:
dataset (_xr.Dataset): Dataset to remove variable from
variable_old (str): Variable to rename, old name
variable_new (str): Variable to rename, new name
Raises:
ValueError: When variable can not be renamed
Returns:
_xr.Dataset: Original dataset
"""
mapping_dict = {variable_old: variable_new}
try:
output_dataset = dataset.rename(mapping_dict)
except ValueError as exc:
raise ValueError(
f"ERROR: Cannot rename variable {variable_old} to {variable_new}."
) from exc
return output_dataset