Source code for humancompatible.detect.helpers.prepare

import logging
import pandas as pd
import numpy as np
from typing import Any, Callable, Dict, List, Tuple

from humancompatible.detect.binarizer.Binarizer import Binarizer
from humancompatible.detect.data_handler import DataHandler

logger = logging.getLogger(__name__)


[docs] def prepare_dataset( input_data: pd.DataFrame, target_data: pd.DataFrame, n_max: int, protected_attrs: List[str], continuous_feats: List[str], feature_processing: Dict[str, Callable[[Any], int]], verbose: int = 1, ) -> Tuple[Binarizer, pd.DataFrame, pd.Series]: """ Prepares a dataset by cleaning, preprocessing, sampling, and structuring it for fairness analysis. This function performs several steps to get the data ready for further processing, especially focusing on handling missing values, applying feature transformations, managing feature types (continuous vs. categorical), sampling, and identifying protected attributes. Args: input_data (pd.DataFrame): The input features DataFrame. target_data (pd.DataFrame): Single-column target vector; same row count as `input_data`. n_max (int): The maximum number of samples to retain. If the dataset size exceeds this, it will be randomly downsampled. protected_attrs (List[str]): A list of column names that are considered protected attributes for fairness analysis. continuous_feats (List[str]): A list of column names identified as continuous features. feature_processing (Dict[str, Callable[[Any], int]]): Mapping from column name to a *callable* that converts each raw value to an integer. verbose (int, default 1): Verbosity level. 0 = silent, 1 = logger output only, 2 = all detailed logs (including solver output). Returns: Tuple[Binarizer, pd.DataFrame, pd.Series]: A tuple containing: - binarizer_protected (Binarizer): The protected-attributes binarizer. - input_data[protected_cols] (pd.DataFrame): The part of the data with protected attributes. - target_data (pd.Series): The corresponding target features. Notes: - Rows with any NaN values in `input_data` will be removed. - Features with only one unique value after NaN removal will be dropped. - The `target_data` is assumed to contain only one column and will be converted to a pandas Series for the output. - Requires `DataHandler` and `Binarizer` classes to be defined elsewhere for `dhandler_protected` and `binarizer_protected` to work correctly. """ mask_x = (~input_data.isnull().any(axis=1)).to_numpy() mask_y = (~target_data.isnull().any(axis=1)).to_numpy() mask = mask_x & mask_y if verbose >= 1: logger.debug(f"Removing {input_data.shape[0] - mask.sum()} rows with nans") input_data = input_data.loc[mask].copy() target_data = target_data.loc[mask].copy() # Preprocess the data for col, map_f in feature_processing.items(): if col in input_data.columns: input_data.loc[:, col] = input_data[col].map(map_f) values = {} bounds = {} for col in input_data.columns: vals = input_data[col].unique() if verbose >= 1: logger.debug(f"Feature {col} has {vals.shape[0]} values") if vals.shape[0] <= 1: input_data.drop(columns=[col], inplace=True) if verbose >= 1: logger.info( f"Feature {col} was removed due to having a single unique value" ) continue if col not in continuous_feats: values[col] = vals else: bounds[col] = (min(vals), max(vals)) n = input_data.shape[0] if n_max < n: samples = np.random.choice(n, size=n_max, replace=False) else: samples = np.random.permutation(n) input_data = input_data.iloc[samples] target_data = target_data[target_data.columns[0]].iloc[samples] protected_cols = [col for col in input_data.columns if col in protected_attrs] dhandler_protected = DataHandler.from_data( input_data[protected_cols], target_data, categ_map=values, bounds_map=bounds, ) binarizer_protected = Binarizer(dhandler_protected, target_positive_vals=[True]) return binarizer_protected, input_data[protected_cols], target_data