Source code for ddlpy.utils

import dateutil.rrule
import itertools
import pandas as pd
import numpy as np


def date_series(start, end, freq=dateutil.rrule.MONTHLY):
    """Split the timespan from `start` to `end` into consecutive (begin, end) pairs.

    The boundaries are the occurrences of a dateutil rrule with the given
    `freq`, beginning at `start` and limited by `end`, followed by `end`
    itself. Every boundary is paired with the one after it and the pairs are
    returned as a list of tuples.
    """
    # occurrences of the rule between start and end, closed off with the end
    boundaries = list(dateutil.rrule.rrule(dtstart=start, until=end, freq=freq))
    boundaries.append(end)

    # couple each boundary with its successor
    periods = list(zip(boundaries, boundaries[1:]))

    # when `end` coincides with an occurrence of the rule (e.g. first of month
    # until first of month) the final pair has zero length; discard it unless
    # it is the only pair
    if len(periods) > 1:
        last_begin, last_end = periods[-1]
        if last_begin == last_end:
            periods.pop()
    return periods


def simplify_dataframe(df: pd.DataFrame, always_preserve=None):
    """
    Drop columns with constant values from the dataframe and collect them in a
    dictionary which is added as attrs of the dataframe.

    The column Meetwaarde.Waarde_Alfanumeriek is also dropped if it is a
    duplicate of Meetwaarde.Waarde_Numeriek. The column names passed in
    `always_preserve` are preserved even if they are constant.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to simplify.
    always_preserve : iterable of column names, optional
        Columns that stay in the returned dataframe even when their values
        are constant. None (the default) preserves no extra columns.

    Returns
    -------
    pd.DataFrame
        A copy holding only the varying (and preserved) columns. Its `attrs`
        maps each removed column name to the value found in the first row.
        A dataframe without rows is returned as a copy with empty `attrs`.

    Raises
    ------
    ValueError
        If a name in `always_preserve` is not a column of `df`.
    """
    always_preserve = [] if always_preserve is None else list(always_preserve)
    for colname in always_preserve:
        if colname not in df.columns:
            raise ValueError(f"column '{colname}' not present in dataframe")

    # without rows there is no first row to compare against, so no column can
    # be classified as constant: keep everything
    if len(df) == 0:
        df_simple = df.copy()
        df_simple.attrs = {}
        return df_simple

    # define which columns are constant (equal to the first row everywhere)
    bool_constant = (df == df.iloc[0]).all(axis=0)

    # drop Waarde_Alfanumeriek if duplicate of Waarde_Numeriek
    str_num = "Meetwaarde.Waarde_Numeriek"
    str_alf = "Meetwaarde.Waarde_Alfanumeriek"
    if str_num in df.columns and str_alf in df.columns:
        try:
            df_alf = df[str_alf].astype(float)
        except (ValueError, TypeError):
            # text that is not numeric cannot duplicate the numeric column
            df_alf = None
        if df_alf is not None and np.allclose(df[str_num], df_alf, equal_nan=True):
            # NOTE: flagging the column as constant removes it from the output
            # and also stores its first-row value in attrs
            bool_constant[str_alf] = True

    # preserve some columns (even if their values are constant) by setting
    # them as not constant
    for colname in always_preserve:
        bool_constant[colname] = False

    # constant columns are flattened and converted to dict of attrs
    df_attrs = df.loc[:, bool_constant].iloc[0].to_dict()

    # varying columns are kept in output dataframe
    df_simple = df.loc[:, ~bool_constant].copy()

    # attach as attrs to dataframe
    df_simple.attrs = df_attrs
    return df_simple
def code_description_attrs_from_dataframe(df: pd.DataFrame):
    """
    Collect a code -> description mapping for each "<name>.Code" column.

    Every column whose name ends in ".Code" is matched with the column that
    has the same prefix and ends in ".Omschrijving". The unique combinations
    of both columns are turned into a dictionary with the codes as keys and
    the descriptions as values. Code columns without a matching description
    column are skipped.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe holding the ".Code" and ".Omschrijving" columns.

    Returns
    -------
    dict
        Maps the name of each code column to its {code: description} dict.
    """
    code_suffix = ".Code"
    oms_suffix = ".Omschrijving"

    var_attrs_dict = {}
    for colname_code in df.columns:
        # plain suffix comparison: the dot has to be a literal dot
        if not (isinstance(colname_code, str) and colname_code.endswith(code_suffix)):
            continue

        # derive the description column from the code column so that the two
        # always belong together, independent of the column order
        colname_oms = colname_code[: -len(code_suffix)] + oms_suffix
        if colname_oms not in df.columns:
            continue

        meas_twocol = df[[colname_code, colname_oms]].drop_duplicates()
        attr_dict = meas_twocol.set_index(colname_code)[colname_oms].to_dict()

        # drop empty attribute names/keys since these are not supported when
        # writing to netcdf file
        attr_dict.pop("", None)

        var_attrs_dict[colname_code] = attr_dict
    return var_attrs_dict
def dataframe_to_xarray(df: pd.DataFrame, always_preserve=None):
    """
    Converts the measurement dataframe to a xarray dataset.

    The dataframe is first simplified with `simplify_dataframe()` to minimize
    the size of the netcdf dataset on disk. The timestamps are converted to
    UTC since xarray does not support non-UTC timestamps. These can be
    converted to different timezones after loading the netcdf and converting
    to a pandas dataframe with df.index.tz_convert().

    Furthermore, all ".Omschrijving" variables are dropped and the information
    is added as attributes to the Code variables. Lastly, all string variables
    are converted to char arrays to minimize filesizes when writing the netcdf
    with engine="netcdf4" or engine="h5netcdf". Char arrays are always used
    with engine="scipy" or engine="netcdf4" with format="NETCDF4_CLASSIC".

    Parameters
    ----------
    df : pd.DataFrame
        The measurement dataframe.
    always_preserve : list of column names, optional
        Passed on to `simplify_dataframe()`; None (the default) is passed on
        as an empty list.

    Returns
    -------
    The dataset created with `DataFrame.to_xarray()`, with the attrs of the
    simplified dataframe as dataset attributes.
    """
    preserve = [] if always_preserve is None else always_preserve
    df_simple = simplify_dataframe(df, always_preserve=preserve)

    # convert to UTC to please xarray/netcdf4 (otherwise we get invalid timestamps)
    # adding a refdate with tzinfo is also possible but adds confusion and
    # timestamps still have to be stored as UTC
    # getattr: an index that is not a DatetimeIndex has no tz attribute
    if getattr(df_simple.index, "tz", None) is not None:
        df_simple.index = df_simple.index.tz_convert(None)

    # convert to xarray dataset and add ds_attrs
    ds = df_simple.to_xarray()
    ds = ds.assign_attrs(df_simple.attrs)

    # assign attrs with code+omschrijving to each *.Code variable
    var_attrs_dict = code_description_attrs_from_dataframe(df)
    # iterate over a snapshot of the names since variables are reassigned
    for varn in list(ds.data_vars):
        if varn in var_attrs_dict:
            ds[varn] = ds[varn].assign_attrs(var_attrs_dict[varn])

    # drop .Omschrijving variables
    omschrijving_vars = [
        varn for varn in ds.data_vars if varn.endswith(".Omschrijving")
    ]
    ds = ds.drop_vars(omschrijving_vars)

    # enforce char arrays to reduce filesize for strings with engine netcdf4/h5netcdf
    # char arrays are used per default with engine scipy/netcdf4_classic
    for var in ds.data_vars:
        # a variable without values has no maximum string length to encode
        if ds[var].dtype.kind in {"O", "U"} and ds[var].size > 0:
            maxlen = int(ds[var].str.len().max())
            ds[var].encoding = {"dtype": f"S{maxlen}"}
    return ds