Skip to content

Column Utils

Utilities for working with column names.

LaggedColumnPairs dataclass

A dataclass to store information about a column and its lagged pairs.

Source code in amee_utils/feature_generator/feature_set/column_utils.py
@dataclass
class LaggedColumnPairs:
    """
    A dataclass to store information about a column and its lagged pairs.

    Attributes
    ----------
    column_name : str
        Name identifying the column group. When built by
        ``get_pairs_from_columns`` this is the original column name with its
        last ``_``-separated segment removed.
    prefix : str
        Prefix of the lag suffix. When built by ``get_pairs_from_columns``
        this is the value returned by ``find_prefix`` for the column.
    sorted_pairs : Sequence[tuple[Any, Any]]
        Two-element tuples pairing up the periods found for the column. When
        built by ``get_pairs_from_columns`` each tuple is
        ``(larger_period, smaller_period)`` and the sequence is sorted
        ascending.
    """

    column_name: str
    prefix: str
    sorted_pairs: Sequence[tuple[Any, Any]]

find_period(column_name)

Extract the period from a column name.

Parameters:

Name Type Description Default
column_name str

The name of the column to extract the period from.

required

Returns:

Type Description
int

The period extracted from the column name.

Source code in amee_utils/feature_generator/feature_set/column_utils.py
def find_period(column_name: str) -> int:
    """
    Return the integer formed by the digits that end a column name.

    Parameters
    ----------
    column_name : str
        Column name expected to end in one or more digits.

    Returns
    -------
    int
        The trailing run of digits converted to an integer.

    Raises
    ------
    ValueError
        If the column name does not end in a digit.
    """
    trailing_digits = re.search(r"\d+$", column_name)
    # Guard clause: without a trailing digit run there is no period to report.
    if trailing_digits is None:
        raise ValueError(f"Could not find period for column name: {column_name}")
    return int(trailing_digits.group())

find_prefix(column_name)

Find the prefix of a column name.

Parameters:

Name Type Description Default
column_name str

The name of the column.

required

Returns:

Type Description
str

The alphabetical characters immediately preceding the trailing digits of the column name (e.g. "lag" for "sales_lag12").

Source code in amee_utils/feature_generator/feature_set/column_utils.py
def find_prefix(column_name: str) -> str:
    """
    Return the letters that sit directly before a column name's trailing digits.

    Parameters
    ----------
    column_name : str
        Column name expected to end in ASCII letters followed by digits.

    Returns
    -------
    str
        The run of ASCII letters immediately preceding the trailing digits,
        e.g. ``"lag"`` for ``"sales_lag12"``.

    Raises
    ------
    ValueError
        If the name does not end in one or more letters followed by one or
        more digits.
    """
    letters_then_digits = re.search(r"([A-Za-z]+)\d+$", column_name)
    # Guard clause: the suffix must be letters followed by digits.
    if letters_then_digits is None:
        raise ValueError(f"Could not find prefix for column name: {column_name}")
    return letters_then_digits.group(1)

get_pairs_from_columns(column_names)

Generate pairs of lagged columns from a list of column names.

Parameters:

Name Type Description Default
column_names list of str

List of column names to generate pairs from.

required

Returns:

Type Description
list of LaggedColumnPairs

List of LaggedColumnPairs objects containing the column name, prefix, and sorted pairs.

Raises:

Type Description
ValueError

If a column name cannot be parsed.

Source code in amee_utils/feature_generator/feature_set/column_utils.py
def get_pairs_from_columns(column_names: list[str]) -> list[LaggedColumnPairs]:
    """
    Group lagged column names by base name and pair up their periods.

    Each column name is reduced to a base name by dropping its last
    ``_``-separated segment. The periods found for every base name are then
    combined two at a time.

    Parameters
    ----------
    column_names : list of str
        Column names to group; each must be parseable by ``find_prefix`` and
        ``find_period``.

    Returns
    -------
    list of LaggedColumnPairs
        One entry per base name, in first-seen order, holding the base name,
        the prefix of the last column seen for that base, and the ascending
        list of ``(larger_period, smaller_period)`` tuples.

    Raises
    ------
    ValueError
        If a column name cannot be parsed.

    """
    grouped: DefaultColumnInfo = defaultdict(lambda: {"prefix": "", "periods": set()})

    for col in column_names:
        try:
            lag_prefix = find_prefix(col)
            lag_period = find_period(col)
        except ValueError as e:
            raise ValueError(f"Error parsing column name '{col}': {str(e)}") from e

        # Everything before the last underscore ("" when there is none).
        entry = grouped[col.rpartition("_")[0]]
        entry["prefix"] = lag_prefix
        entry["periods"].add(lag_period)

    return [
        LaggedColumnPairs(
            column_name=base_name,
            prefix=entry["prefix"],
            # combinations() over ascending periods yields (smaller, larger);
            # each pair is flipped to (larger, smaller) before sorting.
            sorted_pairs=sorted(
                (later, earlier)
                for earlier, later in combinations(sorted(entry["periods"]), 2)
            ),
        )
        for base_name, entry in grouped.items()
    ]