# Source code for ddlpy.utils

import dateutil.rrule
import itertools
import pandas as pd


def date_series(start, end, freq=dateutil.rrule.MONTHLY):
    """Return a list of (start, end) pairs covering the timespan start -> end.

    Parameters
    ----------
    start, end : datetime.datetime
        Bounds of the timespan.
    freq : int, optional
        A ``dateutil.rrule`` frequency constant (default ``MONTHLY``).

    Returns
    -------
    list of tuple
        Consecutive (period_start, period_end) pairs; the last pair ends
        exactly at `end`.
    """
    # generate the recurrence dates, then make sure `end` itself is included
    dates = list(dateutil.rrule.rrule(dtstart=start, until=end, freq=freq))
    dates.append(end)
    # consecutive pairs: (d0, d1), (d1, d2), ...
    result = list(zip(dates, dates[1:]))
    # drop a trailing empty period, which occurs when `end` falls exactly on
    # a recurrence date (e.g. first of month until first of month)
    if len(result) > 1 and result[-1][0] == result[-1][1]:
        del result[-1]
    return result


def simplify_dataframe(df: pd.DataFrame):
    """
    Drop columns with constant values from the dataframe and collect them in
    a dictionary which is added as attrs of the dataframe.
    """
    # a column is constant when every row equals the first row
    is_constant = (df == df.iloc[0]).all()
    # flatten the constant columns into a plain dict of attributes
    constant_attrs = df.loc[:, is_constant].iloc[0].to_dict()
    # keep only the varying columns in the returned dataframe
    result = df.loc[:, ~is_constant].copy()
    # attach the constant values as attrs
    result.attrs = constant_attrs
    return result
def code_description_attrs_from_dataframe(df: pd.DataFrame):
    """
    Build a per-variable attribute mapping from paired Code/Omschrijving columns.

    For each column whose name contains ".Code", collect the unique
    (code, description) pairs from it and its matching ".Omschrijving" column.

    Parameters
    ----------
    df : pd.DataFrame
        Measurement dataframe containing paired ``*.Code`` and
        ``*.Omschrijving`` columns.

    Returns
    -------
    dict
        Maps each ``*.Code`` column name to a ``{code: description}`` dict.
    """
    # regex=False: match the literal ".Code" substring; an unescaped "."
    # would otherwise act as a regex wildcard and match any character
    colname_code_list = df.columns[df.columns.str.contains(".Code", regex=False)]
    var_attrs_dict = {}
    for colname_code in colname_code_list:
        colname_oms = colname_code.replace(".Code", ".Omschrijving")
        # unique code/description combinations for this pair of columns
        meas_twocol = df[[colname_code, colname_oms]].drop_duplicates()
        attr_dict = meas_twocol.set_index(colname_code)[colname_oms].to_dict()
        var_attrs_dict[colname_code] = attr_dict
    return var_attrs_dict
def dataframe_to_xarray(df: pd.DataFrame, drop_if_constant=None):
    """
    Converts the measurement dataframe to a xarray dataset, including several
    cleanups to minimize the size of the netcdf dataset on disk:

    - The column 'Parameter_Wat_Omschrijving' is dropped (combination of
      information in other columns)
    - The column 'Meetwaarde.Waarde_Alfanumeriek' is dropped if
      'Meetwaarde.Waarde_Numeriek' is present (contains duplicate values in
      that case)
    - All Omschrijving columns are dropped and added as attributes to the
      Code variables
    - All NVT-only Code columns are dropped and added as ds attributes
    - All location columns are dropped and added as ds attributes
    - All drop_if_constant columns are dropped and added as ds attributes
      (if the values are indeed constant)

    The timestamps are converted to UTC since xarray does not support non-UTC
    timestamps. These can be converted to different timezones after loading
    the netcdf and converting to a pandas dataframe with
    df.index.tz_convert().

    When writing the dataset to disk with ds.to_netcdf() it is recommended to
    use `format="NETCDF3_CLASSIC"` or `format="NETCDF4_CLASSIC"` since this
    automatically converts variables of dtype <U to |S which saves a lot of
    disk space for DDL data.

    Parameters
    ----------
    df : pd.DataFrame
        Measurement dataframe as returned by the ddl.
    drop_if_constant : list of str, optional
        Column names to drop (and add as ds.attrs) if their values are
        constant. Defaults to no extra columns.

    Returns
    -------
    xarray.Dataset
        Dataset with varying columns as variables and constant/duplicate
        information moved into (variable) attributes.
    """
    # None sentinel instead of a mutable default argument
    if drop_if_constant is None:
        drop_if_constant = []

    # columns with duplicate info (often not constant), will be dropped
    cols_bulky = ["Parameter_Wat_Omschrijving"]
    if ("Meetwaarde.Waarde_Alfanumeriek" in df.columns
            and "Meetwaarde.Waarde_Numeriek" in df.columns):
        # drop alfanumeriek if duplicate of numeriek
        # TODO: should not be returned by ddl
        cols_bulky.append("Meetwaarde.Waarde_Alfanumeriek")

    # all omschrijving columns, will be dropped (added as ds[varn].attrs via
    # code_description_attrs_from_dataframe()). regex=False matches the
    # literal ".Omschrijving" instead of treating "." as a regex wildcard
    cols_omschrijving = df.columns[
        df.columns.str.contains(".Omschrijving", regex=False)
    ].tolist()

    # all-NVT *.Code columns, will be dropped (codes added as ds.attrs)
    bool_onlynvt_code = (df == "NVT").all(axis=0)
    cols_onlynvt_code = df.columns[bool_onlynvt_code].tolist()
    cols_onlynvt_code = [x for x in cols_onlynvt_code if x.endswith(".Code")]

    # location columns, will be dropped (added as ds.attrs)
    cols_location = ["Code", "Naam", "Coordinatenstelsel", "X", "Y"]

    # drop_if_constant columns whose values are indeed constant, will be
    # dropped (added as ds.attrs)
    cols_constant = []
    for colname in drop_if_constant:
        assert colname in df.columns
        if len(df[colname].drop_duplicates()) == 1:
            cols_constant.append(colname)

    # create ds attrs for all nvt/location/constant columns
    ds_attrs = {}
    attrs_columns = cols_onlynvt_code + cols_constant + cols_location
    for colname in attrs_columns:
        ds_attrs[colname] = df[colname].iloc[0]

    # drop columns
    drop_columns = (cols_bulky + cols_location + cols_constant
                    + cols_onlynvt_code + cols_omschrijving)
    df_simple = df.drop(drop_columns, axis=1, errors="ignore")

    # convert to UTC to please xarray/netcdf4 (otherwise we get invalid
    # timestamps); adding a refdate with tzinfo is also possible but adds
    # confusion and timestamps still have to be stored as UTC
    if df_simple.index.tz is not None:
        df_simple.index = df_simple.index.tz_convert(None)

    # convert to xarray dataset and add ds_attrs
    ds = df_simple.to_xarray()
    ds = ds.assign_attrs(ds_attrs)

    # assign attrs with code+omschrijving to each *.Code variable
    var_attrs_dict = code_description_attrs_from_dataframe(df)
    for varn in ds.data_vars:
        if varn in var_attrs_dict:
            ds[varn] = ds[varn].assign_attrs(var_attrs_dict[varn])
    return ds