# Source code for humancompatible.detect.methods.msd.mapping_msd

import numpy as np
import pandas as pd
from typing import List, Tuple, Any



# [docs]
def subgroup_map_from_conjuncts_binarized(
    conjuncts: List[int], X: np.ndarray[np.bool_]
) -> np.ndarray[np.bool_]:
    """
    Generate a boolean subgroup mapping from the conjunction (AND) of features.

    The result marks a row as `True` only if that row of `X` is truthy in every
    column listed in `conjuncts`, i.e. the individual satisfies all conditions
    of the conjunction.

    Args:
        conjuncts (List[int]): Column indices of `X`. Each index names a
            feature that must hold for a row to belong to the subgroup.
            An empty list selects every row.
        X (np.ndarray[`np.bool_`]): A 2D array where rows represent individuals
            and columns represent binarized features. Boolean dtype is the
            expected input; other dtypes (for example 0/1 integers or object
            arrays of Python bools) are accepted and interpreted by
            truthiness. Array-likes are converted with `np.asarray`.

    Returns:
        np.ndarray[`np.bool_`]: A freshly allocated 1D boolean array with one
            entry per row of `X`. `mapping[i]` is `True` if `X[i, conj]` is
            truthy for all `conj` in `conjuncts`, and `False` otherwise.
            `X` itself is never modified.

    Raises:
        IndexError: If any index in `conjuncts` is out of bounds for the
            columns of `X`.

    Examples:
        >>> import numpy as np
        >>> X_data = np.array([
        ...     [True,  True,  False, True],   # Row 0
        ...     [True,  False, True,  True],   # Row 1
        ...     [False, True,  True,  False],  # Row 2
        ...     [True,  True,  True,  True]    # Row 3
        ... ])
        >>>
        >>> # Subgroup where feature at index 0 AND feature at index 1 are True
        >>> conjuncts_1 = [0, 1]
        >>> subgroup_map_from_conjuncts_binarized(conjuncts_1, X_data)
        array([ True, False, False,  True])
        >>> # Explanation: Only Row 0 and Row 3 have both X[:,0] and X[:,1] as True.

        >>> # Subgroup where feature at index 2 is True
        >>> conjuncts_2 = [2]
        >>> subgroup_map_from_conjuncts_binarized(conjuncts_2, X_data)
        array([False,  True,  True,  True])

        >>> # Subgroup where feature at index 0 AND feature at index 2 are True
        >>> conjuncts_3 = [0, 2]
        >>> subgroup_map_from_conjuncts_binarized(conjuncts_3, X_data)
        array([False,  True, False,  True])

        >>> # Test with an empty list of conjuncts (should return all True)
        >>> subgroup_map_from_conjuncts_binarized([], X_data)
        array([ True,  True,  True,  True])

        >>> # Integer-encoded (0/1) data gives the same answer as boolean data
        >>> subgroup_map_from_conjuncts_binarized([0, 1], X_data.astype(int))
        array([ True, False, False,  True])

        >>> # Test with an invalid conjunct index (will raise IndexError)
        >>> try:
        ...     subgroup_map_from_conjuncts_binarized([0, 99], X_data)
        ... except IndexError as e:
        ...     print(e)
        index 99 is out of bounds for axis 1 with size 4
    """
    # No-op for ndarray input; lets array-likes support the `[:, conj]`
    # positional indexing used below.
    features = np.asarray(X)

    # Start from all True so that an empty conjunction selects every row
    # (the logical AND of no conditions is True).
    mapping = np.ones((features.shape[0],), dtype=bool)

    for conj in conjuncts:
        # Scalar column indexing raises IndexError when `conj` is out of bounds.
        column = features[:, conj]
        # The in-place `&=` refuses to cast a non-bool result back into the
        # bool buffer, so coerce first. `copy=False` keeps bool input copy-free.
        mapping &= column.astype(bool, copy=False)
    return mapping




# [docs]
def subgroup_map_from_conjuncts_dataframe(
    rule: List[Tuple[int, Any]], X: pd.DataFrame
) -> np.ndarray[np.bool_]:
    """
    Compute subgroup membership for an MSD rule over a pandas DataFrame.

    The rule is a conjunction: a row belongs to the subgroup only if every
    Bin in *rule* evaluates to True on that row. The integer in each
    (index, Bin) pair is ignored; the column is looked up by the Bin's
    `.feature.name`, so the result does not depend on column order.

    Args:
        rule (List[Tuple[int, Any]]): (index, Bin) pairs describing the
            subgroup, as returned by `detect_bias(...)` or
            `detect_bias_two_samples`. Each Bin must expose `.feature.name`
            and an `.evaluate(values)` method.
        X (pd.DataFrame): The original (protected-only) DataFrame passed
            to `detect_bias`. Must contain every column named by the
            rule's Bins.

    Returns:
        np.ndarray[`np.bool_`]: A 1-D boolean array with one entry per row of
            `X`, True where the row belongs to the subgroup. An empty rule
            yields all True.

    Raises:
        KeyError: If `X` is missing a column required by the rule.
    """
    n_rows = len(X)
    # All True to begin with, so an empty rule keeps every row.
    membership = np.ones(n_rows, dtype=bool)

    for _, condition in rule:
        feat = condition.feature.name
        if feat in X.columns:
            # The Bin is handed the raw column values and is expected to
            # return a per-row boolean result, which narrows the subgroup.
            values = X[feat].to_numpy()
            membership &= condition.evaluate(values)
        else:
            raise KeyError(f"Column '{feat}' required by rule is missing.")

    return membership