Skip to content

Period Week Feature Set

Module for the PeriodWeekFeatureSet class and PeriodWeekFeatureConfig class.

Author: Daniel Wertheimer

PeriodWeekFeatureConfig

Bases: FeatureConfig

Configuration class for period-based weekly feature calculation.

Attributes:

Name Type Description
lag_weeks list[int]

List of integers representing the number of weeks to lag. (default is [2, 4, 8, 12])

calculation_columns list[str]

List of columns to perform calculations on.

deviation_from_mean_weeks list[int]

List of integers representing weeks for mean deviation calculation.

calculate_deltas bool

Flag to determine if deltas should be calculated.

calculate_percentage_change bool

Flag to determine if percentage change should be calculated.

resolve_divide_by_zero bool

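Flag to determine if divide by zero errors should be resolved. Can only be True when both calculate_percentage_change and calculate_deltas are True; otherwise a ValueError is raised on construction.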

Source code in amee_utils/feature_generator/feature_set/period_week.py
@attrs.define
class PeriodWeekFeatureConfig(FeatureConfig):
    """
    Configuration class for period-based weekly feature calculation.

    Attributes
    ----------
    lag_weeks : list[int]
        List of integers representing the number of weeks to lag. (default is [2, 4, 8, 12])
    calculation_columns : list[str]
        List of columns to perform calculations on. Must be non-empty.
    deviation_from_mean_weeks : list[int]
        List of integers representing weeks for mean deviation calculation.
    calculate_deltas : bool
        Flag to determine if deltas should be calculated.
    calculate_percentage_change : bool
        Flag to determine if percentage change should be calculated.
    resolve_divide_by_zero : bool
        Flag to determine if divide by zero errors should be resolved. May only be
        True when both calculate_percentage_change and calculate_deltas are True.

    Raises
    ------
    ValueError
        If deviation_from_mean_weeks is not a list, if calculation_columns is
        empty, or if resolve_divide_by_zero is set without both
        calculate_percentage_change and calculate_deltas.
    """

    # List defaults are built through attrs.Factory so that every instance gets
    # its own list. A literal list default is stored as-is by attrs and would be
    # shared (and mutated) across all instances that rely on the default.
    lag_weeks: list[int] = attrs.Factory(lambda: [2, 4, 8, 12])
    calculation_columns: list[str] = attrs.Factory(list)
    deviation_from_mean_weeks: list[int] = attrs.Factory(list)
    calculate_deltas: bool = False
    calculate_percentage_change: bool = False
    resolve_divide_by_zero: bool = False

    def __attrs_post_init__(self):
        """
        Post-initialisation validation for PeriodWeekFeatureConfig.

        Ensures valid types and values for the attributes.

        Raises
        ------
        ValueError
            If any of the attribute constraints described on the class is violated.
        """
        if not isinstance(self.deviation_from_mean_weeks, list):
            raise ValueError("deviation_from_mean_weeks must be a list of integers.")
        # An empty list is falsy, so this also rejects the default value: callers
        # must always name at least one column.
        if not self.calculation_columns:
            raise ValueError("calculation_columns must be a non-empty list of strings.")
        if self.resolve_divide_by_zero and (not self.calculate_percentage_change or not self.calculate_deltas):
            raise ValueError(
                "resolve_divide_by_zero can only be True if calculate_percentage_change and calculate_deltas are True."
            )

PeriodWeekFeatureSet

Bases: FeatureSet[PeriodWeekFeatureConfig]

FeatureSet class for calculating period-based weekly features.

Methods:

Name Description
calculate

Calculate the features based on the given configurations.

deviation_from_mean

Calculate the deviation from mean for specified columns.

_calculate_deltas

Calculate the deltas between pairs of lagged sum columns (columns whose names start with SUM_ and contain _PW).

Source code in amee_utils/feature_generator/feature_set/period_week.py
class PeriodWeekFeatureSet(FeatureSet[PeriodWeekFeatureConfig]):
    """
    Feature set that derives period-week features from a DataFrame.

    Methods
    -------
    calculate(df, dataset_config, feature_config, calculation_date) -> DataFrame
        Build the full feature frame according to the supplied configuration.

    deviation_from_mean(df, key_cols, agg_col, mean_weeks) -> DataFrame
        Compute one deviation-from-mean column for a single aggregated column.

    _calculate_deltas(df, key_cols) -> DataFrame
        Add difference columns between pairs of lagged ``SUM_*`` columns.
    """

    def calculate(
        self,
        df: DataFrame,
        dataset_config: DatasetConfig,
        feature_config: PeriodWeekFeatureConfig,
        calculation_date: datetime,
    ) -> DataFrame:
        """
        Build the period-week feature frame.

        Parameters
        ----------
        df : DataFrame
            Input DataFrame.
        dataset_config : DatasetConfig
            Dataset settings; ``date_col`` and ``key_cols`` are read from it.
        feature_config : PeriodWeekFeatureConfig
            Settings selecting the columns, lags and optional extra features.
        calculation_date : datetime
            Date for which the calculation is performed. Not referenced by
            this implementation.

        Returns
        -------
        DataFrame
            One row per distinct key combination, with the feature columns
            left-joined onto it.
        """
        weekly_df = create_week_date_col(df=df, date_col=dataset_config.date_col)
        key_cols = dataset_config.key_cols

        # Start from the distinct keys so that every key survives the left joins below.
        features = weekly_df.select(key_cols).distinct()

        aggregation_strategies = [SumAggregation(), MeanAggregation(), StddevAggregation()]

        # Lagged sum / mean / stddev aggregations, one join per calculation column.
        for column in feature_config.calculation_columns:
            aggregator = LaggedAggregation(
                periods_list=feature_config.lag_weeks,
                time_col="week",
                lag_type="period_week",
            )
            lagged = aggregator.apply(weekly_df, column, key_cols, aggregation_strategies)
            features = features.join(lagged, on=key_cols, how="left")

        # Deviation-from-mean features, one join per (column, window) combination.
        deviation_jobs = (
            (column, weeks)
            for column in feature_config.calculation_columns
            for weeks in feature_config.deviation_from_mean_weeks
        )
        for column, weeks in deviation_jobs:
            deviation = self.deviation_from_mean(weekly_df, key_cols, column, weeks)
            features = features.join(deviation, on=key_cols, how="left")

        if feature_config.calculate_deltas:
            features = self._calculate_deltas(features, key_cols)

        if feature_config.calculate_percentage_change:
            features = calculate_percentage_change(
                df=features,
                key_cols=key_cols,
                resolve_division_by_zero=feature_config.resolve_divide_by_zero,
            )

        return features

    def deviation_from_mean(
        self,
        df: DataFrame,
        key_cols: list[str],
        agg_col: str,
        mean_weeks: int,
    ) -> DataFrame:
        """
        Compute the deviation of the one-week sum from the multi-week mean.

        The result column ``DEV_<AGG_COL>_PW<mean_weeks>`` is the per-key sum of
        ``agg_col`` over the frame produced by ``create_period_week_lag_df(df, 1, "week")``
        minus the per-key mean of ``agg_col`` over the frame produced by
        ``create_period_week_lag_df(df, mean_weeks, "week")``.

        Parameters
        ----------
        df : DataFrame
            Input DataFrame.
        key_cols : list[str]
            Columns to group by and join on.
        agg_col : str
            Name of the column being aggregated.
        mean_weeks : int
            Period-week window passed to ``create_period_week_lag_df`` for the mean.

        Returns
        -------
        DataFrame
            The key columns plus the single deviation column. Keys absent from
            the one-week frame do not appear, because that frame is the left
            side of the join.
        """
        column_tag = agg_col.upper()
        mean_name = f"MEAN_{column_tag}_PW{mean_weeks}"
        latest_sum_name = f"SUM_{column_tag}_PW1"
        deviation_name = f"DEV_{column_tag}_PW{mean_weeks}"

        window_mean = (
            create_period_week_lag_df(df, mean_weeks, "week")
            .groupBy(*key_cols)
            .agg(F.mean(agg_col).alias(mean_name))
        )
        latest_sum = (
            create_period_week_lag_df(df, 1, "week")
            .groupBy(*key_cols)
            .agg(F.sum(agg_col).alias(latest_sum_name))
        )

        deviation = F.col(latest_sum_name) - F.col(mean_name)
        joined = latest_sum.join(window_mean, on=key_cols, how="left")
        return joined.withColumn(deviation_name, deviation).select(*key_cols, deviation_name)

    def _calculate_deltas(self, df: DataFrame, key_cols: list[str]) -> DataFrame:
        """
        Add difference columns between pairs of lagged sum columns.

        Candidate columns are those whose name starts with ``SUM_`` and contains
        ``_PW``. The pairs to subtract come from ``get_pairs_from_columns``.

        Parameters
        ----------
        df : DataFrame
            Input DataFrame.
        key_cols : list[str]
            List of key columns. Not referenced by this implementation.

        Returns
        -------
        DataFrame
            The input frame with one ``DELTA_*`` column appended per pair.
        """

        def without_prefix(name: str) -> str:
            # Drop everything up to and including the first underscore, e.g. the "SUM_" prefix.
            return name.split("_", 1)[1]

        lagged_sum_columns = [name for name in df.columns if name.startswith("SUM_") and "_PW" in name]

        result = df
        for pair_group in get_pairs_from_columns(lagged_sum_columns):
            for pair in pair_group.sorted_pairs:
                minuend, subtrahend = pair[0], pair[1]
                delta_name = f"DELTA_{without_prefix(minuend)}_{without_prefix(subtrahend)}"
                result = result.withColumn(delta_name, F.col(minuend) - F.col(subtrahend))

        return result

calculate(df, dataset_config, feature_config, calculation_date)

Calculate period-based weekly features.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame.

required
dataset_config DatasetConfig

Configuration for the dataset.

required
feature_config PeriodWeekFeatureConfig

Configuration for the feature calculation.

required
calculation_date datetime

Date for which the calculation is performed.

required

Returns:

Type Description
DataFrame

DataFrame with the calculated features.

Source code in amee_utils/feature_generator/feature_set/period_week.py
def calculate(
    self,
    df: DataFrame,
    dataset_config: DatasetConfig,
    feature_config: PeriodWeekFeatureConfig,
    calculation_date: datetime,
) -> DataFrame:
    """
    Build the period-week feature frame.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame.
    dataset_config : DatasetConfig
        Dataset settings; ``date_col`` and ``key_cols`` are read from it.
    feature_config : PeriodWeekFeatureConfig
        Settings selecting the columns, lags and optional extra features.
    calculation_date : datetime
        Date for which the calculation is performed. Not referenced by
        this implementation.

    Returns
    -------
    DataFrame
        One row per distinct key combination, with the feature columns
        left-joined onto it.
    """
    weekly_df = create_week_date_col(df=df, date_col=dataset_config.date_col)
    key_cols = dataset_config.key_cols

    # Start from the distinct keys so that every key survives the left joins below.
    features = weekly_df.select(key_cols).distinct()

    aggregation_strategies = [SumAggregation(), MeanAggregation(), StddevAggregation()]

    # Lagged sum / mean / stddev aggregations, one join per calculation column.
    for column in feature_config.calculation_columns:
        aggregator = LaggedAggregation(
            periods_list=feature_config.lag_weeks,
            time_col="week",
            lag_type="period_week",
        )
        lagged = aggregator.apply(weekly_df, column, key_cols, aggregation_strategies)
        features = features.join(lagged, on=key_cols, how="left")

    # Deviation-from-mean features, one join per (column, window) combination.
    deviation_jobs = (
        (column, weeks)
        for column in feature_config.calculation_columns
        for weeks in feature_config.deviation_from_mean_weeks
    )
    for column, weeks in deviation_jobs:
        deviation = self.deviation_from_mean(weekly_df, key_cols, column, weeks)
        features = features.join(deviation, on=key_cols, how="left")

    if feature_config.calculate_deltas:
        features = self._calculate_deltas(features, key_cols)

    if feature_config.calculate_percentage_change:
        features = calculate_percentage_change(
            df=features,
            key_cols=key_cols,
            resolve_division_by_zero=feature_config.resolve_divide_by_zero,
        )

    return features

deviation_from_mean(df, key_cols, agg_col, mean_weeks)

Calculate the deviation from mean for specified columns.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame.

required
key_cols list[str]

List of key columns to group by.

required
agg_col str

Column name to aggregate.

required
mean_weeks int

Number of weeks to calculate mean over.

required

Returns:

Type Description
DataFrame

DataFrame with deviation from mean features.

Source code in amee_utils/feature_generator/feature_set/period_week.py
def deviation_from_mean(
    self,
    df: DataFrame,
    key_cols: list[str],
    agg_col: str,
    mean_weeks: int,
) -> DataFrame:
    """
    Compute the deviation of the one-week sum from the multi-week mean.

    The result column ``DEV_<AGG_COL>_PW<mean_weeks>`` is the per-key sum of
    ``agg_col`` over the frame produced by ``create_period_week_lag_df(df, 1, "week")``
    minus the per-key mean of ``agg_col`` over the frame produced by
    ``create_period_week_lag_df(df, mean_weeks, "week")``.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame.
    key_cols : list[str]
        Columns to group by and join on.
    agg_col : str
        Name of the column being aggregated.
    mean_weeks : int
        Period-week window passed to ``create_period_week_lag_df`` for the mean.

    Returns
    -------
    DataFrame
        The key columns plus the single deviation column. Keys absent from
        the one-week frame do not appear, because that frame is the left
        side of the join.
    """
    column_tag = agg_col.upper()
    mean_name = f"MEAN_{column_tag}_PW{mean_weeks}"
    latest_sum_name = f"SUM_{column_tag}_PW1"
    deviation_name = f"DEV_{column_tag}_PW{mean_weeks}"

    window_mean = (
        create_period_week_lag_df(df, mean_weeks, "week")
        .groupBy(*key_cols)
        .agg(F.mean(agg_col).alias(mean_name))
    )
    latest_sum = (
        create_period_week_lag_df(df, 1, "week")
        .groupBy(*key_cols)
        .agg(F.sum(agg_col).alias(latest_sum_name))
    )

    deviation = F.col(latest_sum_name) - F.col(mean_name)
    joined = latest_sum.join(window_mean, on=key_cols, how="left")
    return joined.withColumn(deviation_name, deviation).select(*key_cols, deviation_name)