Skip to content

Feature Calculator

Calculation module for generating sets of features.

FeatureCalculator

Class to calculate features for a given dataset using a provided configuration.

Parameters:

Name Type Description Default
config Config

Configuration object containing dataset and feature set information.

required

Methods:

Name Description
run

Calculates features for a given dataset using the provided configuration.

Source code in amee_utils/feature_generator/calculator.py
class FeatureCalculator:
    """
    Compute every configured feature set for a dataset and merge the results.

    Parameters
    ----------
    config : Config
        Configuration object containing dataset and feature set information.
        ``run`` reads ``config.dataset`` (its ``date_col`` and ``key_cols``)
        and iterates over ``config.features``.

    Methods
    -------
    run(df: DataFrame, calculation_date: datetime) -> DataFrame
        Calculates features for a given dataset using the provided configuration.

    """

    def __init__(self, config: Config) -> None:
        """
        Store the configuration used by later calls to ``run``.

        Parameters
        ----------
        config : Config
            Configuration object containing dataset and feature set information.

        """
        self.config = config

    def run(self, df: DataFrame, calculation_date: datetime) -> DataFrame:
        """
        Build one feature table by left-joining each configured feature set.

        Parameters
        ----------
        df : DataFrame
            Input dataset to calculate features on.
        calculation_date : datetime
            Feature calculation date. Only rows whose ``date_col`` value is
            before this date are passed to the feature calculators.

        Returns
        -------
        DataFrame
            The distinct ``key_cols`` of ``df`` with the output of every
            calculated feature set left-joined on ``key_cols``.

        Notes
        -----
        A feature set is skipped without error when ``FEATURE_CONFIG_MAP`` has
        no (truthy) entry for the class name of its configuration object.

        """
        dataset_cfg = self.config.dataset
        key_cols = dataset_cfg.key_cols

        # Feature calculators only ever see rows dated before the cut-off.
        is_before_cutoff = F.col(dataset_cfg.date_col) < calculation_date
        history = df.filter(is_before_cutoff)

        # The key set is taken from the unfiltered input, not from `history`.
        result = df.select(key_cols).distinct()

        for feature_cfg in self.config.features:
            calculator_cls = FEATURE_CONFIG_MAP.get(feature_cfg.__class__.__name__)
            if not calculator_cls:
                # Nothing registered under this config class name: skip it.
                continue
            feature_df = calculator_cls().calculate(  # type: ignore
                df=history,
                dataset_config=dataset_cfg,
                feature_config=feature_cfg,
                calculation_date=calculation_date,
            )
            result = result.join(feature_df, on=key_cols, how="left")
        return result

run(df, calculation_date)

Calculate features for a given dataset using the provided configuration.

Parameters:

Name Type Description Default
df DataFrame

Input dataset to calculate features on.

required
calculation_date datetime

Feature calculation date. Used to filter df to invoices before this date for feature calculations.

required

Returns:

Type Description
DataFrame

The distinct key columns of df, with the output of each calculated feature set left-joined on those key columns.

Source code in amee_utils/feature_generator/calculator.py
def run(self, df: DataFrame, calculation_date: datetime) -> DataFrame:
    """
    Build one feature table by left-joining each configured feature set.

    Parameters
    ----------
    df : DataFrame
        Input dataset to calculate features on.
    calculation_date : datetime
        Feature calculation date. Only rows whose ``date_col`` value is
        before this date are passed to the feature calculators.

    Returns
    -------
    DataFrame
        The distinct ``key_cols`` of ``df`` with the output of every
        calculated feature set left-joined on ``key_cols``.

    Notes
    -----
    A feature set is skipped without error when ``FEATURE_CONFIG_MAP`` has
    no (truthy) entry for the class name of its configuration object.

    """
    dataset_cfg = self.config.dataset
    key_cols = dataset_cfg.key_cols

    # Feature calculators only ever see rows dated before the cut-off.
    is_before_cutoff = F.col(dataset_cfg.date_col) < calculation_date
    history = df.filter(is_before_cutoff)

    # The key set is taken from the unfiltered input, not from `history`.
    result = df.select(key_cols).distinct()

    for feature_cfg in self.config.features:
        calculator_cls = FEATURE_CONFIG_MAP.get(feature_cfg.__class__.__name__)
        if not calculator_cls:
            # Nothing registered under this config class name: skip it.
            continue
        feature_df = calculator_cls().calculate(  # type: ignore
            df=history,
            dataset_config=dataset_cfg,
            feature_config=feature_cfg,
            calculation_date=calculation_date,
        )
        result = result.join(feature_df, on=key_cols, how="left")
    return result