# Source code for ddlpy.utils
import dateutil.rrule
import itertools
import pandas as pd
import numpy as np
def date_series(start, end, freq=dateutil.rrule.MONTHLY):
    """
    Split the timespan from `start` to `end` into consecutive (begin, end) pairs.

    The boundaries are the occurrences of a dateutil recurrence rule with
    frequency `freq` that begins at `start` and runs until `end`, with `end`
    itself appended as the closing boundary. The result is a list of 2-tuples.
    """
    # all rule occurrences within the timespan, closed off by the requested end
    occurrences = dateutil.rrule.rrule(freq=freq, dtstart=start, until=end)
    boundaries = [*occurrences, end]

    # pair every boundary with its successor
    periods = list(zip(boundaries, boundaries[1:]))

    # if `end` coincides with an occurrence the final period has zero length
    # (e.g. first of month until first of month); drop it unless it is the only one
    if len(periods) > 1:
        last_begin, last_end = periods[-1]
        if last_begin == last_end:
            periods.pop()
    return periods
# [docs]
def simplify_dataframe(df: pd.DataFrame, always_preserve=None):
    """
    Drop columns with constant values from the dataframe and collect them
    in a dictionary which is added as attrs of the dataframe.

    The column Meetwaarde.Waarde_Alfanumeriek is also dropped if it is a duplicate of
    Meetwaarde.Waarde_Numeriek, i.e. if both columns are (nearly) equal after
    conversion to float. Its first value is then stored in the attrs, just like
    the value of a constant column. If one of both columns cannot be converted to
    float, the alphanumeric column is not a duplicate and it is treated like any
    other column.

    The column names passed in `always_preserve` are preserved even if they are
    constant. A ValueError is raised if one of these names is not a column of `df`.

    A dataframe without rows has no constant values to collect: a copy with empty
    attrs is returned.

    The input dataframe is not modified, the returned dataframe is a copy of which
    the attrs are replaced by the collected dictionary.
    """
    if always_preserve is None:
        always_preserve = []
    for colname in always_preserve:
        if colname not in df.columns:
            raise ValueError(f"column '{colname}' not present in dataframe")

    # without rows there is no first row to compare the other rows with
    if len(df.index) == 0:
        df_empty = df.copy()
        df_empty.attrs = {}
        return df_empty

    # define which columns are constant (all rows equal to the first row)
    bool_constant = (df == df.iloc[0]).all(axis=0)

    # drop Waarde_Alfanumeriek if duplicate of Waarde_Numeriek
    str_num = "Meetwaarde.Waarde_Numeriek"
    str_alf = "Meetwaarde.Waarde_Alfanumeriek"
    if str_num in df.columns and str_alf in df.columns:
        try:
            df_num = df[str_num].astype(float)
            df_alf = df[str_alf].astype(float)
        except (ValueError, TypeError):
            # non-numeric values present, so the columns cannot be duplicates
            is_duplicate = False
        else:
            is_duplicate = np.allclose(df_num, df_alf, equal_nan=True)
        if is_duplicate:
            bool_constant[str_alf] = True

    # preserve some columns (even if their values are constant) by setting them as not constant
    for colname in always_preserve:
        bool_constant[colname] = False

    # constant columns are flattened and converted to dict of attrs
    df_attrs = df.loc[:, bool_constant].iloc[0].to_dict()

    # varying columns are kept in output dataframe
    df_simple = df.loc[:, ~bool_constant].copy()

    # attach as attrs to dataframe
    df_simple.attrs = df_attrs
    return df_simple
def code_description_attrs_from_dataframe(df: pd.DataFrame):
    """
    Create a code-to-description lookup for each "<name>.Code" column.

    Every column of which the name ends with ".Code" is paired by name with the
    column "<name>.Omschrijving". For each pair, the unique combinations of both
    columns are converted to a dictionary {code: description}. Code columns without
    a matching description column are skipped.

    Returns a dictionary with the names of the code columns as keys and the
    {code: description} dictionaries as values.
    """
    code_suffix = ".Code"
    oms_suffix = ".Omschrijving"

    var_attrs_dict = {}
    for colname_code in df.columns:
        # literal suffix match, a regex "." would also match e.g. "_Code"
        if not isinstance(colname_code, str) or not colname_code.endswith(code_suffix):
            continue
        # pair by name instead of by position, so columns without a counterpart
        # cannot shift the pairing of the other columns
        colname_oms = colname_code[: -len(code_suffix)] + oms_suffix
        if colname_oms not in df.columns:
            continue

        meas_twocol = df[[colname_code, colname_oms]].drop_duplicates()
        attr_dict = meas_twocol.set_index(colname_code)[colname_oms].to_dict()

        # drop empty attribute names/keys since these are not supported when writing to netcdf file
        attr_dict.pop("", None)

        var_attrs_dict[colname_code] = attr_dict
    return var_attrs_dict
# [docs]
def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]):
    """
    Convert the measurement dataframe to an xarray dataset.

    The dataframe is simplified with `simplify_dataframe()` first, which keeps the
    netcdf dataset on disk small: the attrs of the simplified dataframe become the
    attributes of the dataset. The column names in `always_preserve` are passed on
    to `simplify_dataframe()`.

    Timezone-aware timestamps are converted to UTC since xarray does not support
    non-UTC timestamps. After loading the netcdf and converting it to a pandas
    dataframe, they can be converted to another timezone with df.index.tz_convert().

    Each ".Code" variable gets its code/description combinations as attributes,
    after which all ".Omschrijving" variables are dropped from the dataset.

    When writing the dataset to disk with ds.to_netcdf() it is recommended to use
    `format="NETCDF3_CLASSIC"` or `format="NETCDF4_CLASSIC"` since this automatically
    converts variables of dtype <U to |S which saves a lot of disk space for DDL data.
    """
    simplified = simplify_dataframe(df, always_preserve=always_preserve)

    # xarray/netcdf4 need UTC, otherwise we get invalid timestamps. A refdate with
    # tzinfo would also be possible, but that adds confusion and the timestamps
    # would still have to be stored as UTC.
    if simplified.index.tz is not None:
        simplified.index = simplified.index.tz_convert(None)

    # dataset with the constant values as global attributes
    ds = simplified.to_xarray().assign_attrs(simplified.attrs)

    # code+omschrijving lookups become attributes of the matching *.Code variables
    code_attrs = code_description_attrs_from_dataframe(df)
    for varn in ds.data_vars:
        if varn in code_attrs:
            ds[varn] = ds[varn].assign_attrs(code_attrs[varn])

    # the descriptions now live in the attributes, so the variables can be dropped
    description_vars = [varn for varn in ds.data_vars if varn.endswith(".Omschrijving")]
    return ds.drop_vars(description_vars)