Source code for ddlpy.utils
import dateutil.rrule
import itertools
import pandas as pd
import numpy as np
def date_series(start, end, freq=dateutil.rrule.MONTHLY):
    """Split the timespan from start to end into consecutive (begin, end) date pairs.

    The boundaries are the occurrences of a dateutil rrule with the given
    frequency, starting at `start` and limited by `end`, closed off with
    `end` itself. Every boundary is coupled to the next one, so the result
    is a list of 2-tuples that together cover the timespan.
    """
    # rule occurrences between start and end, followed by the end date itself
    boundaries = list(dateutil.rrule.rrule(freq=freq, dtstart=start, until=end))
    boundaries.append(end)

    # couple each boundary to its successor
    periods = list(zip(boundaries, boundaries[1:]))

    # if the end date coincides with a rule occurrence, the closing period has
    # zero length (e.g. first of month until first of month); discard it,
    # unless it is the only period there is
    if len(periods) > 1:
        last_begin, last_end = periods[-1]
        if last_begin == last_end:
            periods.pop()
    return periods
[docs]
def simplify_dataframe(df: pd.DataFrame, always_preserve=None) -> pd.DataFrame:
    """
    Drop columns with constant values from the dataframe and collect them
    in a dictionary which is added as attrs of the dataframe.

    The column Meetwaarde.Waarde_Alfanumeriek is also dropped if it is a duplicate of
    Meetwaarde.Waarde_Numeriek. If the alphanumeric column cannot be converted to
    float (it contains non-numeric text), it is not a duplicate and is kept.
    Note that a dropped duplicate column is flagged the same way as a constant
    column, so the value of its first row ends up in the attrs as well.

    The column names passed in `always_preserve` are preserved even if they are constant.

    A dataframe without rows has no constant columns to detect; a copy with
    empty attrs is returned in that case.

    Parameters
    ----------
    df : the dataframe to simplify, it is not modified.
    always_preserve : iterable of column names to keep as columns, or None
        (default) to preserve nothing extra.

    Returns
    -------
    A new dataframe with the varying (and preserved) columns, with the
    constant columns stored as `attrs`.

    Raises
    ------
    ValueError if a name in `always_preserve` is not a column of `df`.
    """
    # None replaces a mutable `[]` default; it behaves like an empty list.
    # Materialize once since the names are iterated twice below.
    preserved = [] if always_preserve is None else list(always_preserve)

    # fail fast on unknown column names
    for colname in preserved:
        if colname not in df.columns:
            raise ValueError(f"column '{colname}' not present in dataframe")

    # without rows there is no first row to compare against
    if len(df) == 0:
        df_empty = df.copy()
        df_empty.attrs = {}
        return df_empty

    # define which columns are constant (all values equal to the first row)
    bool_constant = (df == df.iloc[0]).all(axis=0)

    # drop Waarde_Alfanumeriek if duplicate of Waarde_Numeriek
    str_num = "Meetwaarde.Waarde_Numeriek"
    str_alf = "Meetwaarde.Waarde_Alfanumeriek"
    if str_num in df.columns and str_alf in df.columns:
        try:
            df_alf = df[str_alf].astype(float)
        except (ValueError, TypeError):
            # non-numeric content, so it carries information of its own
            df_alf = None
        if df_alf is not None and np.allclose(df[str_num], df_alf, equal_nan=True):
            bool_constant[str_alf] = True

    # preserve some columns (even if their values are constant) by setting them as not constant
    for colname in preserved:
        bool_constant[colname] = False

    # constant columns are flattened and converted to dict of attrs
    df_attrs = df.loc[:, bool_constant].iloc[0].to_dict()

    # varying columns are kept in output dataframe
    df_simple = df.loc[:, ~bool_constant].copy()

    # attach as attrs to dataframe
    df_simple.attrs = df_attrs
    return df_simple
def code_description_attrs_from_dataframe(df: pd.DataFrame):
    """
    Collect the code -> description mapping for every "<prefix>.Code" column.

    A code column is only used if the dataframe also contains the matching
    "<prefix>.Omschrijving" column. The columns are paired by their shared
    prefix and the suffixes are matched literally, so unrelated columns like
    "Parameter_Wat_Omschrijving" are ignored and cannot shift the pairing.

    Returns
    -------
    dict with the name of each code column as key and a
    {code: description} dict as value.
    """
    code_suffix = ".Code"
    oms_suffix = ".Omschrijving"

    var_attrs_dict = {}
    for colname_code in df.columns:
        # only string labels can follow the naming convention
        if not isinstance(colname_code, str) or not colname_code.endswith(code_suffix):
            continue
        colname_oms = colname_code[: -len(code_suffix)] + oms_suffix
        if colname_oms not in df.columns:
            # code column without description, nothing to attach
            continue

        meas_twocol = df[[colname_code, colname_oms]].drop_duplicates()
        attr_dict = meas_twocol.set_index(colname_code)[colname_oms].to_dict()
        # drop empty attribute names/keys since these are not supported when writing to netcdf file
        attr_dict.pop("", None)
        var_attrs_dict[colname_code] = attr_dict
    return var_attrs_dict
[docs]
def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]):
    """
    Build an xarray dataset from the measurement dataframe.

    The dataframe is reduced with `simplify_dataframe()` first, which keeps the
    netcdf dataset small on disk; the attrs of the simplified dataframe become
    the attributes of the dataset.

    A timezone-aware index is converted to UTC, since xarray does not support
    non-UTC timestamps. After loading the netcdf and converting to a pandas
    dataframe, another timezone can be applied with df.index.tz_convert().

    The ".Omschrijving" variables are removed from the dataset; their content
    is attached as attributes to the corresponding Code variables instead.

    Finally, string variables get a char array encoding, which reduces the
    filesize when writing the netcdf with engine="netcdf4" or
    engine="h5netcdf". With engine="scipy" or engine="netcdf4" combined with
    format="NETCDF4_CLASSIC", char arrays are always used.
    """
    simplified = simplify_dataframe(df, always_preserve=always_preserve)

    # xarray/netcdf4 need UTC timestamps (otherwise we get invalid timestamps).
    # A refdate with tzinfo would also be possible, but it adds confusion and
    # the timestamps would still have to be stored as UTC.
    index_is_tz_aware = simplified.index.tz is not None
    if index_is_tz_aware:
        simplified.index = simplified.index.tz_convert(None)

    # dataset with the attrs of the simplified dataframe as dataset attributes
    dataset = simplified.to_xarray().assign_attrs(simplified.attrs)

    # attach the code+omschrijving mapping to each *.Code variable
    code_attrs = code_description_attrs_from_dataframe(df)
    for name in list(dataset.data_vars):
        if name in code_attrs:
            dataset[name] = dataset[name].assign_attrs(code_attrs[name])

    # the descriptions now live in the attributes, so drop their variables
    description_vars = [
        name for name in dataset.data_vars if name.endswith(".Omschrijving")
    ]
    dataset = dataset.drop_vars(description_vars)

    # enforce char arrays for strings to reduce filesize with engine netcdf4/h5netcdf,
    # these are used per default with engine scipy/netcdf4_classic
    for name in dataset.data_vars:
        if dataset[name].dtype.kind not in {"O", "U"}:
            continue
        longest = int(dataset[name].str.len().max())
        dataset[name].encoding = {"dtype": f"S{longest}"}
    return dataset