Source code for ddlpy.utils
import dateutil.rrule
import itertools
import pandas as pd
def date_series(start, end, freq=dateutil.rrule.MONTHLY):
    """
    Split the timespan from start to end into consecutive (begin, end) pairs.

    The boundaries are the occurrences of a dateutil recurrence rule that
    starts at `start`, runs until `end` and repeats with frequency `freq`
    (monthly by default). The `end` itself is always appended as the final
    boundary, so the last pair always closes on `end`.

    Returns a list of 2-tuples, each holding the begin and end of one period.
    """
    # all occurrences of the recurrence rule, closed off with the end itself
    boundaries = [
        *dateutil.rrule.rrule(dtstart=start, until=end, freq=freq),
        end,
    ]

    # pair every boundary with its successor
    periods = list(zip(boundaries, boundaries[1:]))

    # when the end coincides with an occurrence of the rule (e.g. first of
    # month until first of month) the final period has zero length; it is
    # dropped, unless it is the only period there is
    if len(periods) > 1:
        last_begin, last_end = periods[-1]
        if last_begin == last_end:
            periods.pop()

    return periods
[docs]
def simplify_dataframe(df: pd.DataFrame):
    """
    Drop columns with constant values from the dataframe and collect them
    in a dictionary which is added as attrs of the dataframe.

    A column counts as constant when every value compares equal to the value
    in the first row. Since NaN does not compare equal to itself, a column
    containing NaN is never considered constant and is kept as a column.

    The input dataframe is not modified. The returned dataframe is a copy
    holding only the varying columns; its attrs are replaced by the mapping
    of constant column name to constant value. A dataframe without rows has
    no first row to compare against and is returned as a copy with all
    columns kept and empty attrs.
    """
    # without rows there is no reference row; df.iloc[0] would raise IndexError
    if len(df.index) == 0:
        df_simple = df.copy()
        df_simple.attrs = {}
        return df_simple

    # per column: do all values equal the value in the first row?
    bool_constant = (df == df.iloc[0]).all()

    # constant columns are flattened and converted to dict of attrs
    df_attrs = df.loc[:, bool_constant].iloc[0].to_dict()

    # varying columns are kept in output dataframe
    df_simple = df.loc[:, ~bool_constant].copy()

    # attach as attrs to dataframe
    df_simple.attrs = df_attrs
    return df_simple
def code_description_attrs_from_dataframe(df: pd.DataFrame):
    """
    Build a code-to-description lookup for every ".Code" column of the dataframe.

    For each column whose name contains the literal text ".Code", the
    matching description column is the same name with ".Code" replaced by
    ".Omschrijving". The unique (code, description) combinations of these
    two columns are collected in a dictionary.

    Returns a dictionary that maps each ".Code" column name to a dictionary
    of {code: description}.

    Raises a KeyError if a ".Code" column has no matching ".Omschrijving"
    column in the dataframe.
    """
    # match ".Code" literally: with the default regex=True the "." matches any
    # character, so a column like "Post_Code" would be selected as well while
    # the literal str.replace() below would leave its name unchanged
    is_code_column = df.columns.str.contains(".Code", regex=False)

    var_attrs_dict = {}
    for colname_code in df.columns[is_code_column]:
        colname_oms = colname_code.replace(".Code", ".Omschrijving")
        # reduce to the unique code/description combinations
        meas_twocol = df[[colname_code, colname_oms]].drop_duplicates()
        var_attrs_dict[colname_code] = (
            meas_twocol.set_index(colname_code)[colname_oms].to_dict()
        )
    return var_attrs_dict
[docs]
def dataframe_to_xarray(df: pd.DataFrame, drop_if_constant=[]):
    """
    Converts the measurement dataframe to a xarray dataset,
    including several cleanups to minimize the size of the netcdf dataset on disk:

    - The column 'Parameter_Wat_Omschrijving' is dropped (combination of
      information in other columns)
    - The column 'Meetwaarde.Waarde_Alfanumeriek' is dropped if
      'Meetwaarde.Waarde_Numeriek' is present (contains duplicate values in
      that case)
    - All Omschrijving columns are dropped and added as attributes to the
      Code variables
    - All NVT-only Code columns are dropped and added as ds attributes
    - All location columns that are present are dropped and added as ds
      attributes (the value of the first row is used)
    - All drop_if_constant columns are dropped and added as ds attributes
      (if the values are indeed constant)

    The timestamps are converted to UTC since xarray does not support non-UTC
    timestamps. These can be converted to different timezones after loading
    the netcdf and converting to a pandas dataframe with df.index.tz_convert().

    When writing the dataset to disk with ds.to_netcdf() it is recommended to use
    `format="NETCDF3_CLASSIC"` or `format="NETCDF4_CLASSIC"` since this automatically
    converts variables of dtype <U to |S which saves a lot of disk space for DDL data.

    Parameters: `df` is the measurement dataframe, it is not modified and has
    to contain at least one row. `drop_if_constant` is a list of column names
    that are moved to the dataset attributes if they hold a single unique
    value; every name has to be a column of `df` (checked with an assert).
    This default list is only iterated, never mutated.

    Returns the xarray dataset created with DataFrame.to_xarray().
    """
    # create list of columns with duplicate info (often not constant), will be dropped
    cols_bulky = ["Parameter_Wat_Omschrijving"]
    if "Meetwaarde.Waarde_Alfanumeriek" in df.columns and 'Meetwaarde.Waarde_Numeriek' in df.columns:
        # drop alfanumeriek if duplicate of numeriek # TODO: should not be returned by ddl
        cols_bulky.append("Meetwaarde.Waarde_Alfanumeriek")

    # create list of all omschrijving columns, will be dropped (added as
    # ds[varn].attrs via code_description_attrs_from_dataframe()).
    # The name is matched literally: with the default regex=True the "." would
    # match any character. 'Parameter_Wat_Omschrijving' is handled by cols_bulky.
    bool_omschrijving = df.columns.str.contains(".Omschrijving", regex=False)
    cols_omschrijving = df.columns[bool_omschrijving].tolist()

    # create list of all-NVT *.Code columns, will be dropped (codes added as ds.attrs)
    bool_onlynvt_code = (df == 'NVT').all(axis=0)
    cols_onlynvt_code = [x for x in df.columns[bool_onlynvt_code] if x.endswith(".Code")]

    # create list of location columns, will be dropped (added as ds.attrs).
    # Only the columns that are present are considered, consistent with the
    # errors='ignore' of the drop below; reading a missing one raised a KeyError.
    cols_location = [
        x for x in ['Code', 'Naam', 'Coordinatenstelsel', 'X', 'Y']
        if x in df.columns
    ]

    # add drop_if_constant colums to list if values are indeed constant,
    # will be dropped (added as ds.attrs)
    cols_constant = []
    for colname in drop_if_constant:
        assert colname in df.columns, (
            f"drop_if_constant column {colname!r} is not present in the dataframe"
        )
        if len(df[colname].drop_duplicates()) == 1:
            cols_constant.append(colname)

    # create ds attrs for all nvt/location/constant columns from the first row
    attrs_columns = cols_onlynvt_code + cols_constant + cols_location
    ds_attrs = {colname: df[colname].iloc[0] for colname in attrs_columns}

    # drop columns, df.drop() returns a new dataframe so df itself stays untouched
    drop_columns = (cols_bulky + cols_location + cols_constant +
                    cols_onlynvt_code + cols_omschrijving)
    df_simple = df.drop(drop_columns, axis=1, errors='ignore')

    # convert to UTC to please xarray/netcdf4 (otherwise we get invalid timestamps)
    # adding a refdate with tzinfo is also possible but adds confusion and
    # timestamps still have to be stored as UTC
    if df_simple.index.tz is not None:
        df_simple.index = df_simple.index.tz_convert(None)

    # convert to xarray dataset and add ds_attrs
    ds = df_simple.to_xarray()
    ds = ds.assign_attrs(ds_attrs)

    # assign attrs with code+omschrijving to each *.Code variable that is
    # still present as a data variable; iterate over a snapshot of the names
    # since the variables are replaced inside the loop
    var_attrs_dict = code_description_attrs_from_dataframe(df)
    for varn in list(ds.data_vars):
        if varn in var_attrs_dict:
            ds[varn] = ds[varn].assign_attrs(var_attrs_dict[varn])
    return ds