Source code for humancompatible.detect.methods.l_inf.l_inf

import logging
import numpy as np
from typing import Any
from .lp_tools import lin_prog_feas

from humancompatible.detect.binarizer import Binarizer

# Module-level logger named after this module (`__name__`), so the host
# application decides where and whether these messages are emitted.
logger = logging.getLogger(__name__)


def check_l_inf_gap(
    X: np.ndarray,
    y: np.ndarray,
    binarizer: Binarizer,
    feature_involved: str,
    subgroup_to_check: Any,
    delta: float,
    verbose: int = 1,
) -> bool:
    """
    Test whether a protected subgroup's outcome distribution differs from the
    overall population by **at most** `delta` in the l_inf-norm.

    Only rows with a positive target are considered. The inspected feature is
    dropped, a multi-dimensional histogram over the remaining features is built
    for the whole positive population and for the subgroup (on the *same* bin
    edges), both are normalised, and `lin_prog_feas` decides whether they are
    within `delta` of each other.

    Args:
        X (np.ndarray): Protected-attribute slice of the dataset (same rows
            as `y`).
        y (np.ndarray): Boolean target vector.
        binarizer (Binarizer): The binarizer used to encode `X` and `y`. It is
            not modified by this function.
        feature_involved (str): Name of the protected column whose subgroup is
            tested.
        subgroup_to_check (Any): Raw value of the subgroup to isolate.
        delta (float): Threshold for the L-infinity norm.
        verbose (int, default 1): Verbosity level. 0 = silent, 1 = logger
            output only, 2 = all detailed logs (including solver output).

    Returns:
        bool: True if the subgroup histogram is within `delta`; False
        otherwise.

    Raises:
        ValueError: If `delta` is not positive.
        ValueError: If no feature other than `feature_involved` is available
            to build the histograms.
        ValueError: If no rows remain after filtering on the positive outcome
            or on the subgroup.
        ValueError: If a remaining feature is of an unrecognised type.
        KeyError: If `feature_involved` is not in the binarizer's feature
            names.
        KeyError: If `subgroup_to_check` is not a valid value for the feature.
    """
    if delta <= 0:
        raise ValueError("delta must be positive")

    data_handler = binarizer.data_handler
    if feature_involved not in data_handler.feature_names:
        raise KeyError(f"Feature '{feature_involved}' not in protected set")

    X_bin = data_handler.encode(X, one_hot=False)
    y_bin = binarizer.encode_y(y)

    feat_idx = data_handler.feature_names.index(feature_involved)
    feature = data_handler.features[feat_idx]
    try:
        subgroup_code = feature.value_mapping[subgroup_to_check]
    except KeyError as e:
        allowed = list(feature.value_mapping.keys())
        raise KeyError(f"{subgroup_to_check!r} not a valid value "
                       f"for '{feature_involved}'. Allowed: {allowed}") from e

    # Retain only the instances with a positive target outcome
    X_bin_pos = X_bin[y_bin == 1]
    # Rows of the (potentially) discriminated subgroup among the positives
    discr = X_bin_pos[X_bin_pos[:, feat_idx] == subgroup_code]

    # Feature descriptors for every column except the one under study.
    # Build a NEW list: popping from `data_handler.features` directly would
    # permanently remove the feature from the caller's binarizer.
    other_features = [
        feat
        for idx, feat in enumerate(data_handler.features)
        if idx != feat_idx
    ]

    # Drop the inspected column; it is not part of the histogram space
    X_bin_pos = np.delete(X_bin_pos, feat_idx, 1)
    discr = np.delete(discr, feat_idx, 1)

    if X_bin_pos.shape[1] == 0:
        raise ValueError(
            f"No features left besides '{feature_involved}'; "
            "cannot build histograms."
        )

    zero_counts_msg = "Zero total counts after filtering; cannot compute ℓ∞."
    if X_bin_pos.shape[0] == 0 or discr.shape[0] == 0:
        # Fail before calling reductions such as `.max()` on empty columns
        raise ValueError(zero_counts_msg)

    # Choose the number of bins per remaining feature
    bins: list[int] = []
    for i in range(X_bin_pos.shape[1]):
        column = X_bin_pos[:, i]
        kind = str(other_features[i].__class__)
        if 'Binary' in kind:
            # Two bins for a binary feature
            bins.append(2)
        elif 'Categorical' in kind:
            # One bin per encoded category (codes assumed to start at 0)
            bins.append(int(column.max() + 1))
        elif 'Contiguous' in kind:
            # Freedman-Diaconis estimator for continuous features
            bins.append(len(np.histogram_bin_edges(column, bins='fd')) - 1)
        else:
            raise ValueError(
                f"Unsupported feature type {kind} at column {i}; "
                "cannot choose a binning."
            )

    # "Histogramisation". The bin edges are derived from the whole positive
    # population and reused for the subgroup: with integer bin counts,
    # np.histogramdd would otherwise span each sample's own min/max, and the
    # two histograms would not be comparable cell by cell.
    all_counts, edges = np.histogramdd(
        np.asarray(X_bin_pos, dtype=float), bins=bins, density=False
    )
    discr_counts, _ = np.histogramdd(
        np.asarray(discr, dtype=float), bins=edges, density=False
    )

    all_tot = all_counts.sum()
    discr_tot = discr_counts.sum()
    if all_tot == 0 or discr_tot == 0:
        raise ValueError(zero_counts_msg)

    all_hist = all_counts / all_tot
    discr_hist = discr_counts / discr_tot

    # Flatten both histograms into column vectors for the LP
    all_rsh = all_hist.reshape(-1, 1)
    discr_rsh = discr_hist.reshape(-1, 1)

    # NOTE(review): `verbose` is not forwarded to the solver here, so level 2
    # currently behaves like level 1 in this function - confirm whether
    # `lin_prog_feas` accepts a verbosity argument.
    status = lin_prog_feas(all_rsh, discr_rsh, delta=delta)
    is_within = bool(status == 0)  # 0 = feasible

    if verbose >= 1:
        if is_within:
            logger.info(f"The most impacted subgroup bias <= {delta}")
        else:
            logger.info(f"The most impacted subgroup bias > {delta}")

    return is_within