Skip to content

Time Related Feature Set

Module for the Time Related Features and Config classes.

Author: Bryce Senekal

TimeFeatureSet

Bases: FeatureSet[TimeFeaturesConfig]

A feature set for generating features based on the relationship between outlets and relative dates.

Source code in amee_utils/feature_generator/feature_set/time_related.py
class TimeFeatureSet(FeatureSet[TimeFeaturesConfig]):
    """A feature set for generating features based on the relationship between outlets and relative dates.

    Three features are produced per distinct combination of the dataset key columns:

    * ``AVG_DAYS_BETWEEN_ORDERS`` - mean gap in days between consecutive orders, restricted to
      orders dated within the last ``months_for_avg`` months.
    * ``TIME_SINCE_FIRST_ORDER`` - days from the earliest order date to the calculation date.
    * ``TIME_SINCE_LAST_ORDER`` - days from the latest order date to the calculation date.
    """

    def calculate(
        self,
        df: DataFrame,
        dataset_config: DatasetConfig,
        feature_config: TimeFeaturesConfig,
        calculation_date: datetime,
    ) -> DataFrame:
        """
        Calculate the features for the given DataFrame, DatasetConfig, and TimeFeaturesConfig.

        Parameters
        ----------
        df : DataFrame
            The DataFrame to calculate features for.
        dataset_config : DatasetConfig
            The configuration for the dataset; ``key_cols`` and ``date_col`` are read from it.
        feature_config : TimeFeaturesConfig
            The configuration for the features to generate; ``months_for_avg`` is read from it.
        calculation_date : datetime
            Feature calculation date.

        Returns
        -------
        DataFrame
            The distinct key combinations of ``df`` with the calculated feature columns
            left-joined onto them.
        """
        keys = dataset_config.key_cols
        # One row per distinct key combination; every feature frame is joined onto this.
        base = df.select(keys).distinct()

        # Only the average-gap feature uses the look-back window.
        filtered_df = self.filter_order_df(
            df=df,
            date_col=dataset_config.date_col,
            key_cols=keys,
            calc_date=calculation_date,
            months=feature_config.months_for_avg,
        )

        avg_order_time = self.time_btwn_orders(df=filtered_df, date_col=dataset_config.date_col, keys=keys)
        # Tenure and recency are deliberately computed from the unfiltered DataFrame.
        tenure = self.tenure(
            df=df,
            date_col=dataset_config.date_col,
            keys=keys,
            calc_date=calculation_date,
        )
        recency = self.time_since_last_order(
            df=df,
            date_col=dataset_config.date_col,
            keys=keys,
            calc_date=calculation_date,
        )

        # NOTE(review): join_multiple_to_base is defined elsewhere; presumably keys with no
        # rows inside the window end up with a null AVG_DAYS_BETWEEN_ORDERS - confirm.
        return join_multiple_to_base(
            base_df=base,
            df_list=[
                avg_order_time,
                tenure,
                recency,
            ],
            key_cols=keys,
            join_type="left",
        )

    @staticmethod
    def filter_order_df(
        df: DataFrame,
        date_col: str,
        key_cols: list[str],
        calc_date: datetime,
        months: int,
    ) -> DataFrame:
        """Add a Previous_Order column per key, then keep only the rows from the last x months.

        Previous_Order is the preceding value of ``date_col`` within the same key partition,
        ordered by ``date_col``. It is computed before the filter is applied, so the first
        retained row of a key can reference an order dated before the cut-off. The filter
        itself is applied on ``date_col`` and has no upper bound.

        Parameters
        ----------
        df : DataFrame
            The DataFrame to filter.
        date_col : str
            The date column in the DataFrame.
        key_cols : list[str]
            The key columns in the DataFrame.
        calc_date : datetime
            The calculation date.
        months : int
            The number of months before ``calc_date`` to keep (the cut-off date is inclusive).

        Returns
        -------
        DataFrame
            The rows of ``df`` dated on or after the cut-off, with Previous_Order added.
        """
        window = W.Window.partitionBy(key_cols).orderBy(date_col)
        return df.withColumn("Previous_Order", F.lag(F.col(date_col), 1).over(window)).filter(
            F.col(date_col) >= calc_date - relativedelta(months=months),
        )

    @staticmethod
    def time_btwn_orders(df: DataFrame, date_col: str, keys: list[str]) -> DataFrame:
        """Get the average time between each order per outlet over the last X months.

        Parameters
        ----------
        df : DataFrame
            The DataFrame to calculate the features for. It must contain a Previous_Order
            column, as added by ``filter_order_df``.
        date_col : str
            The date column in the DataFrame.
        keys : list[str]
            The key columns in the DataFrame.

        Returns
        -------
        DataFrame
            One row per key combination with AVG_DAYS_BETWEEN_ORDERS, rounded to 2 decimals.
        """
        return (
            df.withColumn(
                "_Time_Difference",
                # Null when Previous_Order is null; Spark's avg skips null inputs.
                F.datediff(F.col(date_col), F.col("Previous_Order")),
            )
            .groupBy(keys)
            .agg(F.round(F.avg(F.col("_Time_Difference")), 2).alias("AVG_DAYS_BETWEEN_ORDERS"))
        )

    @staticmethod
    def tenure(df: DataFrame, date_col: str, keys: list[str], calc_date: datetime) -> DataFrame:
        """Get the time difference of calculation date to first invoice in the data per outlet.

        Parameters
        ----------
        df : DataFrame
            The DataFrame to calculate the features for.
        date_col : str
            The date column in the DataFrame.
        keys : list[str]
            The key columns in the DataFrame.
        calc_date : datetime
            The calculation date.

        Returns
        -------
        DataFrame
            One row per key combination with TIME_SINCE_FIRST_ORDER, in days.
        """
        return df.groupBy(keys).agg(
            F.datediff(
                F.lit(calc_date),
                F.min(F.col(date_col)),
            ).alias("TIME_SINCE_FIRST_ORDER")
        )

    @staticmethod
    def time_since_last_order(df: DataFrame, date_col: str, keys: list[str], calc_date: datetime) -> DataFrame:
        """Get the time difference of calculation date to last invoice in the data per outlet.

        Parameters
        ----------
        df : DataFrame
            The DataFrame to calculate the features for.
        date_col : str
            The date column in the DataFrame.
        keys : list[str]
            The key columns in the DataFrame.
        calc_date : datetime
            The calculation date.

        Returns
        -------
        DataFrame
            One row per key combination with TIME_SINCE_LAST_ORDER, in days.
        """
        return df.groupBy(keys).agg(F.datediff(F.lit(calc_date), F.max(F.col(date_col))).alias("TIME_SINCE_LAST_ORDER"))

calculate(df, dataset_config, feature_config, calculation_date)

Calculate the features for the given DataFrame, DatasetConfig, and FeatureConfig.

Parameters:

Name Type Description Default
df DataFrame

The DataFrame to calculate features for.

required
dataset_config DatasetConfig

The configuration for the dataset.

required
feature_config TimeFeaturesConfig

The configuration for the features to generate.

required
calculation_date datetime

Feature calculation date.

required

Returns:

Type Description
DataFrame

A DataFrame with the calculated features.

Source code in amee_utils/feature_generator/feature_set/time_related.py
def calculate(
    self,
    df: DataFrame,
    dataset_config: DatasetConfig,
    feature_config: TimeFeaturesConfig,
    calculation_date: datetime,
) -> DataFrame:
    """
    Build the time-related features and attach them to the distinct keys of ``df``.

    Parameters
    ----------
    df : DataFrame
        Order-level data to derive the features from.
    dataset_config : DatasetConfig
        Supplies the key columns (``key_cols``) and the date column (``date_col``).
    feature_config : TimeFeaturesConfig
        Supplies ``months_for_avg``, the look-back used for the average order gap.
    calculation_date : datetime
        Reference date the features are measured against.

    Returns
    -------
    DataFrame
        The distinct key combinations with the average-gap, tenure and recency
        frames left-joined onto them.
    """
    key_cols = dataset_config.key_cols
    distinct_keys = df.select(key_cols).distinct()

    # The look-back window applies to the average-gap feature only.
    recent_orders = self.filter_order_df(
        df=df,
        date_col=dataset_config.date_col,
        key_cols=key_cols,
        calc_date=calculation_date,
        months=feature_config.months_for_avg,
    )

    feature_frames = [
        self.time_btwn_orders(df=recent_orders, date_col=dataset_config.date_col, keys=key_cols),
        # Tenure and recency look at the whole, unfiltered history.
        self.tenure(df=df, date_col=dataset_config.date_col, keys=key_cols, calc_date=calculation_date),
        self.time_since_last_order(
            df=df,
            date_col=dataset_config.date_col,
            keys=key_cols,
            calc_date=calculation_date,
        ),
    ]

    return join_multiple_to_base(
        base_df=distinct_keys,
        df_list=feature_frames,
        key_cols=key_cols,
        join_type="left",
    )

filter_order_df(df, date_col, key_cols, calc_date, months) staticmethod

Partition the DataFrame by the config key column(s) and then add Previous_Order by date.

Previous_Order holds the preceding order date for the same key(s); the rows are then filtered on the date column to the last x months.

Parameters:

Name Type Description Default
df DataFrame

The DataFrame to filter.

required
date_col str

The date column in the DataFrame.

required
key_cols list[str]

The key columns in the DataFrame.

required
calc_date datetime

The calculation date.

required
months int

The number of months to filter the data by.

required

Returns:

Type Description
DataFrame

A DataFrame with the filtered data.

Source code in amee_utils/feature_generator/feature_set/time_related.py
@staticmethod
def filter_order_df(
    df: DataFrame,
    date_col: str,
    key_cols: list[str],
    calc_date: datetime,
    months: int,
) -> DataFrame:
    """Attach each row's preceding order date, then keep the last x months of rows.

    The preceding date (Previous_Order) is taken within each key partition,
    ordered by the date column, and is computed before any row is dropped.

    Parameters
    ----------
    df : DataFrame
        Data to annotate and filter.
    date_col : str
        Name of the date column.
    key_cols : list[str]
        Names of the key columns used to partition the data.
    calc_date : datetime
        Date the look-back is measured from.
    months : int
        Length of the look-back in months; the cut-off date itself is kept.

    Returns
    -------
    DataFrame
        Rows dated on or after the cut-off, with a Previous_Order column.
    """
    cutoff = calc_date - relativedelta(months=months)
    per_key_by_date = W.Window.partitionBy(key_cols).orderBy(date_col)
    previous_order = F.lag(F.col(date_col), 1).over(per_key_by_date)

    # The lag column is added first so that it is evaluated over the unfiltered rows.
    with_previous = df.withColumn("Previous_Order", previous_order)
    return with_previous.filter(F.col(date_col) >= cutoff)

tenure(df, date_col, keys, calc_date) staticmethod

Get the time difference of calculation date to first invoice in the data per outlet.

Parameters:

Name Type Description Default
df DataFrame

The DataFrame to calculate the features for.

required
date_col str

The date column in the DataFrame.

required
keys list[str]

The key columns in the DataFrame.

required
calc_date datetime

The calculation date.

required

Returns:

Type Description
DataFrame

A DataFrame with the calculated features.

Source code in amee_utils/feature_generator/feature_set/time_related.py
@staticmethod
def tenure(df: DataFrame, date_col: str, keys: list[str], calc_date: datetime) -> DataFrame:
    """Compute, per key combination, the days between the earliest order and ``calc_date``.

    Parameters
    ----------
    df : DataFrame
        Data holding one date per order row.
    date_col : str
        Name of the date column.
    keys : list[str]
        Names of the key columns to group by.
    calc_date : datetime
        Date the difference is measured up to.

    Returns
    -------
    DataFrame
        One row per key combination with a TIME_SINCE_FIRST_ORDER column.
    """
    first_order = F.min(F.col(date_col))
    days_since_first = F.datediff(F.lit(calc_date), first_order)
    return df.groupBy(keys).agg(days_since_first.alias("TIME_SINCE_FIRST_ORDER"))

time_btwn_orders(df, date_col, keys) staticmethod

Get the average time between each order per outlet over the last X months.

Parameters:

Name Type Description Default
df DataFrame

The DataFrame to calculate the features for.

required
date_col str

The date column in the DataFrame.

required
keys list[str]

The key columns in the DataFrame.

required

Returns:

Type Description
DataFrame

A DataFrame with the calculated features.

Source code in amee_utils/feature_generator/feature_set/time_related.py
@staticmethod
def time_btwn_orders(df: DataFrame, date_col: str, keys: list[str]) -> DataFrame:
    """Average the day gap between consecutive orders for each key combination.

    Parameters
    ----------
    df : DataFrame
        Data that already carries a Previous_Order column next to the date column.
    date_col : str
        Name of the date column.
    keys : list[str]
        Names of the key columns to group by.

    Returns
    -------
    DataFrame
        One row per key combination with AVG_DAYS_BETWEEN_ORDERS rounded to 2 decimals.
    """
    gap_in_days = F.datediff(F.col(date_col), F.col("Previous_Order"))
    with_gaps = df.withColumn("_Time_Difference", gap_in_days)

    mean_gap = F.round(F.avg(F.col("_Time_Difference")), 2)
    return with_gaps.groupBy(keys).agg(mean_gap.alias("AVG_DAYS_BETWEEN_ORDERS"))

time_since_last_order(df, date_col, keys, calc_date) staticmethod

Get the time difference of calculation date to last invoice in the data per outlet.

Parameters:

Name Type Description Default
df DataFrame

The DataFrame to calculate the features for.

required
date_col str

The date column in the DataFrame.

required
keys list[str]

The key columns in the DataFrame.

required
calc_date datetime

The calculation date.

required

Returns:

Type Description
DataFrame

A DataFrame with the calculated features.

Source code in amee_utils/feature_generator/feature_set/time_related.py
@staticmethod
def time_since_last_order(df: DataFrame, date_col: str, keys: list[str], calc_date: datetime) -> DataFrame:
    """Get the time difference of calculation date to last invoice in the data per outlet.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to calculate the features for.
    date_col : str
        The date column in the DataFrame.
    keys : list[str]
        The key columns in the DataFrame.
    calc_date : datetime
        The calculation date.

    Returns
    -------
    DataFrame
        One row per key combination with TIME_SINCE_LAST_ORDER, in days.
    """
    # Days from the most recent order date of each key combination up to calc_date.
    days_since_last = F.datediff(F.lit(calc_date), F.max(F.col(date_col)))
    return df.groupBy(keys).agg(days_since_last.alias("TIME_SINCE_LAST_ORDER"))

TimeFeaturesConfig

Bases: FeatureConfig

Configuration class for the Time Related features based on the past months of data.

Attributes:

Name Type Description
months_for_avg int

Number of months of data from the calculation_date to include for calculating averages.

Source code in amee_utils/feature_generator/feature_set/time_related.py
@attrs.define
class TimeFeaturesConfig(FeatureConfig):
    """
    Settings for the Time Related features.

    Attributes
    ----------
    months_for_avg : int
        How many months of data before the calculation_date are included when
        calculating averages. Defaults to 3.
    """

    # Look-back length, in months, for the averaged features.
    months_for_avg: int = attrs.field(default=3)