Date Utils

Module to manage all date functions for feature generation.

`create_month_date_col(df, date_col)`

Create a new DataFrame with a month-based date column from an existing date column.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The input DataFrame containing the original date column.	required
`date_col`	`str`	The name of the column containing date values to be converted to a month-based format.	required

Returns:

Type	Description
`DataFrame`	A new DataFrame with an additional 'month' column derived from the 'date_col'.

Note

This function converts the date values in the date_col to a month-based format by extracting the year and month from the original dates. The resulting DataFrame will include a new month column containing the month-based date values.

Example

>>> transformed_df = create_month_date_col(input_df, "original_date_column")
>>> transformed_df.show()

Source code in amee_utils/feature_generator/dateutils.py

def create_month_date_col(df: DataFrame, date_col: str) -> DataFrame:
    """
    Create a new DataFrame with a month-based date column from an existing date column.

    Parameters
    ----------
    df : DataFrame
        The input DataFrame containing the original date column.
    date_col : str
        The name of the column containing date values to be converted to a month-based format.

    Returns
    -------
    DataFrame
        A new DataFrame with an additional 'month' column derived from the 'date_col'.

    Note
    ----
    This function converts the date values in the `date_col` to a month-based format by extracting
    the year and month from the original dates. The resulting DataFrame will include a new `month`
    column containing the month-based date values.

    Example
    -------
    ```python
    >>> transformed_df = create_month_date_col(input_df, "original_date_column")
    >>> transformed_df.show()
    ```
    """
    _df = df.withColumn(date_col, F.to_date(F.col(date_col), "yyyy-MM-dd")).withColumn(
        "month", F.trunc(F.col(date_col), "month")
    )

    if _df.filter(F.col("month").isNotNull()).count() == 0:
        raise ValueError(f"Column {date_col} does not contain a valid date.")
    return _df

`create_period_lag_df(df, lag_months, time_col)`

Create a filtered DataFrame representing a lag window of a specified duration in months.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The input DataFrame containing a date column.	required
`lag_months`	`int`	The number of months to include in the lag window.	required
`month_col`	`str`	The name of the column containing date values.	required

Returns:

Type	Description
`DataFrame`	A filtered DataFrame containing rows within the lag window.

Note

This function calculates the lag window by subtracting lag_months from the maximum date in the specified month_col of the input DataFrame. It then filters the DataFrame to include only rows with date values greater than or equal to the start of the lag window.

Example

>>> lagged_df = create_period_lag_df(input_df, 3, "date_column")
>>> lagged_df.show()

Source code in amee_utils/feature_generator/dateutils.py

def create_period_lag_df(
    df: DataFrame,
    lag_months: int,
    time_col: str,
):
    """
    Create a filtered DataFrame representing a lag window of a specified duration in months.

    Parameters
    ----------
    df : DataFrame
        The input DataFrame containing a date column.
    lag_months : int
        The number of months to include in the lag window.
    month_col : str
        The name of the column containing date values.

    Returns
    -------
    DataFrame
        A filtered DataFrame containing rows within the lag window.

    Note
    ----
    This function calculates the lag window by subtracting `lag_months` from the maximum date
    in the specified `month_col` of the input DataFrame. It then filters the DataFrame to
    include only rows with date values greater than or equal to the start of the lag window.

    Example
    -------

    ```python
    >>> lagged_df = create_period_lag_df(input_df, 3, "date_column")
    >>> lagged_df.show()
    ```

    """
    max_date = df.agg({time_col: "max"}).collect()[0][0]
    min_date = max_date - relativedelta(months=lag_months - 1)

    return df.filter(F.col(time_col) >= min_date)

`create_period_week_lag_df(df, lag_weeks, time_col)`

Create a filtered DataFrame representing a lag window of a specified duration in weeks.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The input DataFrame containing a date column.	required
`lag_weeks`	`int`	The number of weeks to include in the lag window.	required
`time_col`	`str`	The name of the column containing date values.	required

Returns:

Type	Description
`DataFrame`	A filtered DataFrame containing rows within the lag window.

Note

This function calculates the lag window by subtracting lag_weeks from the maximum date in the specified time_col of the input DataFrame. It then filters the DataFrame to include only rows with date values greater than or equal to the start of the lag window.

Example

>>> lagged_df = create_period_week_lag_df(input_df, 4, "week_column")
>>> lagged_df.show()

Source code in amee_utils/feature_generator/dateutils.py

def create_period_week_lag_df(
    df: DataFrame,
    lag_weeks: int,
    time_col: str,
):
    """
    Create a filtered DataFrame representing a lag window of a specified duration in weeks.

    Parameters
    ----------
    df : DataFrame
        The input DataFrame containing a date column.
    lag_weeks : int
        The number of weeks to include in the lag window.
    time_col : str
        The name of the column containing date values.

    Returns
    -------
    DataFrame
        A filtered DataFrame containing rows within the lag window.

    Note
    ----
    This function calculates the lag window by subtracting `lag_weeks` from the maximum date
    in the specified `time_col` of the input DataFrame. It then filters the DataFrame to
    include only rows with date values greater than or equal to the start of the lag window.

    Example
    -------
    ```python
    >>> lagged_df = create_period_week_lag_df(input_df, 4, "week_column")
    >>> lagged_df.show()
    ```
    """
    max_date = df.agg({time_col: "max"}).collect()[0][0]
    min_date = max_date - timedelta(weeks=lag_weeks - 1)

    return df.filter(F.col(time_col) >= min_date)

`create_single_month_lag_df(df, lag_months, time_col)`

Create a filtered DataFrame representing a lag window of specified duration in months.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The input DataFrame containing a date column.	required
`lag_months`	`int`	The number of months to lag.	required

Returns:

Type	Description
`DataFrame`	A filtered DataFrame containing rows within the lag window.

Note

This function calculates the lag window by subtracting lag_months from the maximum date in the month column of the input DataFrame. It then filters the DataFrame to include only rows within this lag window.

Example

>>> lagged_df = create_single_month_lag_df(input_df, 3)
>>> lagged_df.show()

Source code in amee_utils/feature_generator/dateutils.py

def create_single_month_lag_df(
    df: DataFrame,
    lag_months: int,
    time_col: str,
):
    """
    Create a filtered DataFrame representing a lag window of specified duration in months.

    Parameters
    ----------
    df : DataFrame
        The input DataFrame containing a date column.
    lag_months : int
        The number of months to lag.

    Returns
    -------
    DataFrame
        A filtered DataFrame containing rows within the lag window.

    Note
    ----
    This function calculates the lag window by subtracting `lag_months` from the maximum date
    in the `month` column of the input DataFrame. It then filters the DataFrame to
    include only rows within this lag window.

    Example
    -------

    ```python
    >>> lagged_df = create_single_month_lag_df(input_df, 3)
    >>> lagged_df.show()
    ```

    """
    max_date = df.agg({time_col: "max"}).collect()[0][0]
    calc_month = max_date - relativedelta(months=lag_months - 1)

    windowed_df = df.filter((F.col(time_col) == calc_month))

    return windowed_df

`create_single_week_lag_df(df, lag_weeks, time_col)`

Create a filtered DataFrame representing a lag window of specified duration in weeks.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The input DataFrame containing a date column.	required
`lag_weeks`	`int`	The number of weeks to lag.	required
`time_col`	`str`	Column name representing the week in the DataFrame.	required

Returns:

Type	Description
`DataFrame`	A filtered DataFrame containing rows within the lag window.

Note

This function calculates the lag window by subtracting lag_weeks from the maximum date in the time_col column of the input DataFrame. It then filters the DataFrame to include only rows within this lag window.

Example

>>> lagged_df = create_single_week_lag_df(input_df, 2, "week")
>>> lagged_df.show()

Source code in amee_utils/feature_generator/dateutils.py

def create_single_week_lag_df(
    df: DataFrame,
    lag_weeks: int,
    time_col: str,
):
    """
    Create a filtered DataFrame representing a lag window of specified duration in weeks.

    Parameters
    ----------
    df : DataFrame
        The input DataFrame containing a date column.
    lag_weeks : int
        The number of weeks to lag.
    time_col : str
        Column name representing the week in the DataFrame.

    Returns
    -------
    DataFrame
        A filtered DataFrame containing rows within the lag window.

    Note
    ----
    This function calculates the lag window by subtracting `lag_weeks` from the maximum date
    in the `time_col` column of the input DataFrame. It then filters the DataFrame to
    include only rows within this lag window.

    Example
    -------
    ```python
    >>> lagged_df = create_single_week_lag_df(input_df, 2, "week")
    >>> lagged_df.show()
    ```
    """
    max_date = df.agg({time_col: "max"}).collect()[0][0]
    calc_week = max_date - timedelta(weeks=lag_weeks - 1)

    windowed_df = df.filter((F.col(time_col) == calc_week))

    return windowed_df

`create_week_date_col(df, date_col)`

Create a new DataFrame with a week-based date column from an existing date column.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The input DataFrame containing the original date column.	required
`date_col`	`str`	The name of the column containing date values to be converted to a week-based format.	required

Returns:

Type	Description
`DataFrame`	A new DataFrame with an additional 'week' column derived from the 'date_col'.

Note

This function converts the date values in the date_col to a week-based format by extracting the week start date (Monday) from the original dates. The resulting DataFrame will include a new week column containing the week-based date values using ISO week format.

Example

>>> transformed_df = create_week_date_col(input_df, "original_date_column")
>>> transformed_df.show()

Source code in amee_utils/feature_generator/dateutils.py

def create_week_date_col(df: DataFrame, date_col: str) -> DataFrame:
    """
    Create a new DataFrame with a week-based date column from an existing date column.

    Parameters
    ----------
    df : DataFrame
        The input DataFrame containing the original date column.
    date_col : str
        The name of the column containing date values to be converted to a week-based format.

    Returns
    -------
    DataFrame
        A new DataFrame with an additional 'week' column derived from the 'date_col'.

    Note
    ----
    This function converts the date values in the `date_col` to a week-based format by extracting
    the week start date (Monday) from the original dates. The resulting DataFrame will include a new
    `week` column containing the week-based date values using ISO week format.

    Example
    -------
    ```python
    >>> transformed_df = create_week_date_col(input_df, "original_date_column")
    >>> transformed_df.show()
    ```
    """
    _df = df.withColumn(date_col, F.to_date(F.col(date_col), "yyyy-MM-dd")).withColumn(
        "week", F.date_trunc("week", F.col(date_col))
    )

    if _df.filter(F.col("week").isNotNull()).count() == 0:
        raise ValueError(f"Column {date_col} does not contain a valid date.")
    return _df

`filter_data_by_months(df, date_column, number_of_months, calculation_date)`

Filter DataFrame to include only the rows within the specified number of months up to the calculation date.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The input DataFrame containing the data.	required
`date_column`	`str`	The name of the column containing the dates.	required
`number_of_months`	`int`	The number of months to go back from the calculation date.	required
`calculation_date`	`datetime`	The date for which the calculations are performed.	required

Returns:

Type	Description
`DataFrame`	The filtered DataFrame.

Example

Input DataFrame:

customer_id	sku_id	quantity	date
C1	SKU1	10	2024-03-01
C1	SKU2	5	2024-04-01
C1	SKU3	3	2024-05-01
C1	SKU4	8	2024-06-01
C1	SKU5	1	2024-07-01
C2	SKU1	2	2024-03-01
C2	SKU2	7	2024-04-01
C2	SKU3	6	2024-05-01
C2	SKU4	4	2024-06-01
C2	SKU5	9	2024-07-01

Output DataFrame (assuming calculation_date is 2024-07-01 and number_of_months is 3):

customer_id	sku_id	quantity	date
C1	SKU2	5	2024-04-01
C1	SKU3	3	2024-05-01
C1	SKU4	8	2024-06-01
C2	SKU2	7	2024-04-01
C2	SKU3	6	2024-05-01
C2	SKU4	4	2024-06-01

Source code in amee_utils/feature_generator/dateutils.py

def filter_data_by_months(
    df: DataFrame,
    date_column: str,
    number_of_months: int,
    calculation_date: datetime,
) -> DataFrame:
    """
    Filter DataFrame to include only the rows within the specified number of months up to the calculation date.

    Parameters
    ----------
    df : DataFrame
        The input DataFrame containing the data.
    date_column : str
        The name of the column containing the dates.
    number_of_months : int
        The number of months to go back from the calculation date.
    calculation_date : datetime
        The date for which the calculations are performed.

    Returns
    -------
    DataFrame
        The filtered DataFrame.

    Example
    -------

    Input DataFrame:

    | customer_id | sku_id | quantity | date       |
    |-------------|--------|----------|------------|
    | C1          | SKU1   | 10       | 2024-03-01 |
    | C1          | SKU2   | 5        | 2024-04-01 |
    | C1          | SKU3   | 3        | 2024-05-01 |
    | C1          | SKU4   | 8        | 2024-06-01 |
    | C1          | SKU5   | 1        | 2024-07-01 |
    | C2          | SKU1   | 2        | 2024-03-01 |
    | C2          | SKU2   | 7        | 2024-04-01 |
    | C2          | SKU3   | 6        | 2024-05-01 |
    | C2          | SKU4   | 4        | 2024-06-01 |
    | C2          | SKU5   | 9        | 2024-07-01 |

    Output DataFrame (assuming calculation_date is 2024-07-01 and number_of_months is 3):

    | customer_id | sku_id | quantity | date       |
    |-------------|--------|----------|------------|
    | C1          | SKU2   | 5        | 2024-04-01 |
    | C1          | SKU3   | 3        | 2024-05-01 |
    | C1          | SKU4   | 8        | 2024-06-01 |
    | C2          | SKU2   | 7        | 2024-04-01 |
    | C2          | SKU3   | 6        | 2024-05-01 |
    | C2          | SKU4   | 4        | 2024-06-01 |

    """
    df = create_month_date_col(df=df, date_col=date_column)
    end_date = calculation_date.replace(day=1) - timedelta(days=1)
    start_date = (end_date - relativedelta(months=number_of_months - 1)).replace(day=1)

    df_filtered = df.filter((F.col(date_column) >= start_date) & (F.col(date_column) <= end_date))

    return df_filtered

`filter_data_by_weeks(df, date_column, number_of_weeks, calculation_date)`

Filter DataFrame to include only the rows within the specified number of complete weeks prior to the calculation date.

This function excludes the current week (the week containing the calculation_date) to ensure only full historical weeks are included. This provides consistent weekly windows regardless of which day of the week the calculation_date falls on.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The input DataFrame containing the data.	required
`date_column`	`str`	The name of the column containing the dates.	required
`number_of_weeks`	`int`	The number of complete weeks to include, counting backwards from the week before the calculation_date's week.	required
`calculation_date`	`datetime`	The date for which the calculations are performed. The week containing this date will be excluded to ensure only complete weeks are included.	required

Returns:

Type	Description
`DataFrame`	The filtered DataFrame containing only rows from complete historical weeks.

Example

Input DataFrame:

customer_id	sku_id	quantity	date
C1	SKU1	10	2024-03-01
C1	SKU2	5	2024-03-08
C1	SKU3	3	2024-03-15
C1	SKU4	8	2024-03-22
C1	SKU5	1	2024-03-29

Output DataFrame (assuming calculation_date is 2024-03-29 and number_of_weeks is 2):

customer_id	sku_id	quantity	date
C1	SKU2	5	2024-03-08
C1	SKU3	3	2024-03-15

Source code in amee_utils/feature_generator/dateutils.py

def filter_data_by_weeks(
    df: DataFrame,
    date_column: str,
    number_of_weeks: int,
    calculation_date: datetime,
) -> DataFrame:
    """
    Filter DataFrame to include only the rows within the specified number of complete weeks
    prior to the calculation date.

    This function excludes the current week (the week containing the calculation_date) to
    ensure only full historical weeks are included. This provides consistent weekly windows
    regardless of which day of the week the calculation_date falls on.

    Parameters
    ----------
    df : DataFrame
        The input DataFrame containing the data.
    date_column : str
        The name of the column containing the dates.
    number_of_weeks : int
        The number of complete weeks to include, counting backwards from the week
        before the calculation_date's week.
    calculation_date : datetime
        The date for which the calculations are performed. The week containing
        this date will be excluded to ensure only complete weeks are included.

    Returns
    -------
    DataFrame
        The filtered DataFrame containing only rows from complete historical weeks.

    Example
    -------
    Input DataFrame:

    | customer_id | sku_id | quantity | date       |
    |-------------|--------|----------|------------|
    | C1          | SKU1   | 10       | 2024-03-01 |
    | C1          | SKU2   | 5        | 2024-03-08 |
    | C1          | SKU3   | 3        | 2024-03-15 |
    | C1          | SKU4   | 8        | 2024-03-22 |
    | C1          | SKU5   | 1        | 2024-03-29 |

    Output DataFrame (assuming calculation_date is 2024-03-29 and number_of_weeks is 2):

    | customer_id | sku_id | quantity | date       |
    |-------------|--------|----------|------------|
    | C1          | SKU2   | 5        | 2024-03-08 |
    | C1          | SKU3   | 3        | 2024-03-15 |
    """
    df = create_week_date_col(df=df, date_col=date_column)

    # We want to include full weeks (Monday to Sunday) ending just before the current week.
    # For example, if calculation_date is a Friday, we ignore the current week.
    # This removes partial weeks.
    calc_week_start = calculation_date - timedelta(days=calculation_date.weekday())
    end_date = calc_week_start - timedelta(days=1)  # make it Sunday
    start_date = calc_week_start - timedelta(weeks=number_of_weeks)

    df_filtered = df.filter((F.col(date_column) >= start_date) & (F.col(date_column) <= end_date))

    return df_filtered