Skip to content

Single Month Feature Set

Module for the SingleMonthFeatureSet class and SingleMonthFeatureConfig class.

Author: Daniel Wertheimer

SingleMonthFeatureConfig

Bases: FeatureConfig

Configuration class for single-month feature calculation.

Attributes:

Name Type Description
lag_months list[int]

List of integers representing the number of months to lag. (default is [1, 3, 6])

count_columns list[str]

List of columns to perform count aggregation.

sum_columns list[str]

List of columns to perform sum aggregation.

mean_columns list[str]

List of columns to perform mean aggregation.

count_if_one_columns list[str]

List of columns to perform count-if-one aggregation.

calculate_percentage_change bool

Flag to determine if percentage change should be calculated.

resolve_divide_by_zero bool

Flag to determine if divide by zero errors should be resolved.

count_includes_missing bool

Flag to determine if missing values should be included in the count.

Source code in amee_utils/feature_generator/feature_set/single_month.py
@attrs.define
class SingleMonthFeatureConfig(FeatureConfig):
    """
    Configuration class for single-month feature calculation.

    Attributes
    ----------
    lag_months : list[int]
        List of integers representing the number of months to lag. (default is [1, 3, 6])
    count_columns : list[str]
        List of columns to perform count aggregation. (default is an empty list)
    sum_columns : list[str]
        List of columns to perform sum aggregation. (default is an empty list)
    mean_columns : list[str]
        List of columns to perform mean aggregation. (default is an empty list)
    count_if_one_columns : list[str]
        List of columns to perform count-if-one aggregation. (default is an empty list)
    calculate_percentage_change : bool
        Flag to determine if percentage change should be calculated. (default is False)
    resolve_divide_by_zero : bool
        Flag to determine if divide by zero errors should be resolved. (default is False)
    count_includes_missing : bool
        Flag passed to ``CountAggregation``; determines if missing values should be
        included in the count. (default is False)
    """

    # List defaults are produced by factories so each instance owns its own
    # list; a literal default would be one object shared by every instance.
    lag_months: list[int] = attrs.field(factory=lambda: [1, 3, 6])
    count_columns: list[str] = attrs.field(factory=list)
    sum_columns: list[str] = attrs.field(factory=list)
    mean_columns: list[str] = attrs.field(factory=list)
    count_if_one_columns: list[str] = attrs.field(factory=list)
    calculate_percentage_change: bool = False
    resolve_divide_by_zero: bool = False
    count_includes_missing: bool = False

    def get_function_dict(self) -> dict[AggregationStrategy, list[str]]:
        """
        Get a dictionary mapping aggregation strategies to column lists.

        A new strategy instance is created for every entry on each call; the
        count strategy is constructed with ``count_includes_missing``.

        Returns
        -------
        dict[AggregationStrategy, list[str]]
            Dictionary mapping aggregation strategies to columns.
        """
        func_dict = {
            SumAggregation(): self.sum_columns,
            MeanAggregation(): self.mean_columns,
            CountAggregation(self.count_includes_missing): self.count_columns,
            CountIfOneAggregation(): self.count_if_one_columns,
        }
        return func_dict

get_function_dict()

Get a dictionary mapping aggregation strategies to column lists.

Returns:

Type Description
dict[AggregationStrategy, list[str]]

Dictionary mapping aggregation strategies to columns.

Source code in amee_utils/feature_generator/feature_set/single_month.py
def get_function_dict(self) -> dict[AggregationStrategy, list[str]]:
    """
    Pair each aggregation strategy with the columns it should be applied to.

    Strategy instances are created fresh on every call, in the order
    sum, mean, count, count-if-one. The count strategy is constructed with
    ``self.count_includes_missing``.

    Returns
    -------
    dict[AggregationStrategy, list[str]]
        Mapping from strategy instance to the configured column list.
    """
    strategy_columns = (
        (SumAggregation(), self.sum_columns),
        (MeanAggregation(), self.mean_columns),
        (CountAggregation(self.count_includes_missing), self.count_columns),
        (CountIfOneAggregation(), self.count_if_one_columns),
    )
    return dict(strategy_columns)

SingleMonthFeatureSet

Bases: FeatureSet[SingleMonthFeatureConfig]

FeatureSet class for calculating single-month features.

Methods:

Name Description
calculate

Calculate the features based on the given configurations.

Source code in amee_utils/feature_generator/feature_set/single_month.py
class SingleMonthFeatureSet(FeatureSet[SingleMonthFeatureConfig]):
    """
    Feature set that builds lagged single-month aggregation features.

    Methods
    -------
    calculate(df, dataset_config, feature_config, calculation_date) -> DataFrame
        Aggregate every configured column and join the results per key.
    """

    def calculate(
        self,
        df: DataFrame,
        dataset_config: DatasetConfig,
        feature_config: SingleMonthFeatureConfig,
        calculation_date: datetime,
    ) -> DataFrame:
        """
        Calculate single-month features.

        Starting from the distinct key combinations in ``df``, each configured
        column is aggregated with its strategy via ``LaggedAggregation`` and the
        result is left-joined back on the key columns. Strategies whose column
        list is empty are skipped.

        Parameters
        ----------
        df : DataFrame
            Input DataFrame.
        dataset_config : DatasetConfig
            Configuration for the dataset; ``date_col`` and ``key_cols`` are read.
        feature_config : SingleMonthFeatureConfig
            Configuration for the feature calculation.
        calculation_date : datetime
            Date for which the calculation is performed. Not referenced in this
            method body (presumably required by the ``FeatureSet`` interface).

        Returns
        -------
        DataFrame
            One row per distinct key combination, with the calculated features.
        """
        monthly_df = create_month_date_col(df=df, date_col=dataset_config.date_col)
        key_cols = dataset_config.key_cols
        result = monthly_df.select(key_cols).distinct()

        for strategy, columns in feature_config.get_function_dict().items():
            if not columns:
                # Nothing configured for this strategy.
                continue

            aggregator = LaggedAggregation(
                periods_list=feature_config.lag_months,
                time_col="month",
                lag_type="single_month",
            )
            for column in columns:
                features = aggregator.apply(monthly_df, column, key_cols, [strategy])
                if feature_config.calculate_percentage_change:
                    features = calculate_percentage_change(
                        df=features,
                        key_cols=key_cols,
                        resolve_division_by_zero=feature_config.resolve_divide_by_zero,
                    )
                # Left join keeps every key even when a feature frame lacks it.
                result = result.join(features, on=key_cols, how="left")

        return result

calculate(df, dataset_config, feature_config, calculation_date)

Calculate single-month features.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame.

required
dataset_config DatasetConfig

Configuration for the dataset.

required
feature_config SingleMonthFeatureConfig

Configuration for the feature calculation.

required
calculation_date datetime

Date for which the calculation is performed.

required

Returns:

Type Description
DataFrame

DataFrame with the calculated features.

Source code in amee_utils/feature_generator/feature_set/single_month.py
def calculate(
    self,
    df: DataFrame,
    dataset_config: DatasetConfig,
    feature_config: SingleMonthFeatureConfig,
    calculation_date: datetime,
) -> DataFrame:
    """
    Calculate single-month features.

    For every aggregation strategy with a non-empty column list, each column
    is aggregated through ``LaggedAggregation`` and the result is left-joined
    onto the distinct key combinations of ``df``.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame.
    dataset_config : DatasetConfig
        Configuration for the dataset; ``date_col`` and ``key_cols`` are read.
    feature_config : SingleMonthFeatureConfig
        Configuration for the feature calculation.
    calculation_date : datetime
        Date for which the calculation is performed. Not referenced in this
        function body.

    Returns
    -------
    DataFrame
        DataFrame with the calculated features, keyed by ``key_cols``.
    """
    with_month = create_month_date_col(df=df, date_col=dataset_config.date_col)
    joined = with_month.select(dataset_config.key_cols).distinct()

    strategies = feature_config.get_function_dict()
    for agg_strategy, target_cols in strategies.items():
        if not target_cols:
            continue  # no columns configured for this strategy

        lagger = LaggedAggregation(
            periods_list=feature_config.lag_months,
            time_col="month",
            lag_type="single_month",
        )
        for target in target_cols:
            per_column = lagger.apply(with_month, target, dataset_config.key_cols, [agg_strategy])
            if feature_config.calculate_percentage_change:
                per_column = calculate_percentage_change(
                    df=per_column,
                    key_cols=dataset_config.key_cols,
                    resolve_division_by_zero=feature_config.resolve_divide_by_zero,
                )
            joined = joined.join(per_column, on=dataset_config.key_cols, how="left")

    return joined