Source code for dpyverification.datasources.csv
"""Datasources to fetch thresholds."""
from pathlib import Path
from typing import ClassVar, Self
import pandas as pd
import xarray as xr
from dpyverification.configuration.default.datasources import CsvConfig
from dpyverification.constants import DataSourceKind, DataType, StandardDim
from dpyverification.datasources.base import BaseDatasource
# Names exported by ``from dpyverification.datasources.csv import *``.
__all__ = ["Csv", "CsvConfig"]
[docs]
class Csv(BaseDatasource):
    """Datasource for reading threshold values from a CSV file.

    The CSV file must contain exactly the columns named by
    ``StandardDim.station``, ``StandardDim.variable`` and
    ``StandardDim.threshold`` plus a ``"value"`` column.
    """

    kind: str = DataSourceKind.CSV
    config_class = CsvConfig
    supported_data_types: ClassVar[set[DataType]] = {
        DataType.threshold,
    }

    def __init__(self, config: CsvConfig) -> None:
        """Store the configuration and start with an empty data array."""
        self.config: CsvConfig = config
        # Placeholder until fetch_data() has been called.
        self.data_array = xr.DataArray()

    def fetch_data(self) -> Self:
        """Parse thresholds from csv file.

        Reads ``config.directory / config.filename``, validates the column
        layout, converts the table to a data array indexed by station,
        variable and threshold, and keeps only the configured ids.

        Returns:
            This instance, with ``data_array`` populated.

        Raises:
            ValueError: If the file lacks an expected column or contains an
                unexpected one, or if a configured station, variable or
                threshold id is not present in the data.
        """
        file_path = Path(self.config.directory) / self.config.filename
        threshold_df = pd.read_csv(file_path)

        # Check that the df has the correct structure
        expected_columns = [
            StandardDim.station,
            StandardDim.variable,
            StandardDim.threshold,
            "value",
        ]
        # Compare through plain lists so membership relies on ``==`` only.
        actual_columns = list(threshold_df.columns)
        # A missing column would otherwise surface later as a bare KeyError
        # from set_index() or the ["value"] lookup.
        missing = [col for col in expected_columns if col not in actual_columns]
        unexpected = [col for col in actual_columns if col not in expected_columns]
        if missing or unexpected:
            msg = f"Expected columns: {expected_columns}. Got: {threshold_df.columns}"
            raise ValueError(msg)

        # Convert it to the internal datamodel
        self.data_array = threshold_df.set_index(
            [StandardDim.station, StandardDim.variable, StandardDim.threshold],
        ).to_xarray()["value"]

        # Filter the data array based on the configured station, variable and threshold ids
        try:
            self.data_array = self.data_array.sel(
                station=self.config.stations,
                variable=self.config.variables,
                threshold=self.config.thresholds,
            )
        except KeyError as e:
            # Parenthesised so both literals form one message; without the
            # parentheses the second literal is a discarded statement.
            msg = (
                "One of the configured station, variable or threshold ids was not found in the "
                f"data. Details: {e}"
            )
            raise ValueError(msg) from e

        # Set the data type as an attribute for later use in the verification process
        self.data_array.attrs["data_type"] = "threshold"  # type:ignore[misc]
        return self