# Source code for humancompatible.detect.data_handler.features.Mixed

from __future__ import annotations

from typing import Optional

import numpy as np
import pandas as pd

from ..types import CategValue, OneDimData

from .Categorical import Categorical
from .Contiguous import Contiguous
from .Feature import Feature, Monotonicity


class Mixed(Feature):
    """Feature whose values are either categorical or contiguous.

    Values listed in ``categ_value_names`` are delegated to an internal
    ``Categorical`` feature; every other value is delegated to an internal
    ``Contiguous`` feature.

    NOTE: the constructor currently always raises ``NotImplementedError``
    because this feature type has not been tested yet.
    """

    # TODO: do it via mixins?
    def __init__(
        self,
        training_vals: OneDimData,
        categ_value_names: list[CategValue],
        map_to: Optional[list[float]] = None,
        name: Optional[str] = None,
        # TODO add the bounds parameter
        default_val: float = 0,
        monotone: Monotonicity = Monotonicity.NONE,
        modifiable: bool = True,
    ):
        """Build the categorical and contiguous sub-features.

        Args:
            training_vals: Training values; split by membership in
                ``categ_value_names``.
            categ_value_names: Values that are treated as categorical.
            map_to: Numeric codes handed to the ``Categorical`` sub-feature.
                Defaults to ``-1, -2, ...`` (one code per categorical value).
            name: Feature name, forwarded to the base class and sub-features.
            default_val: Value written to the contiguous column of
                categorical rows in the one-hot encoding.
            monotone: Monotonicity setting, forwarded to the base class and
                sub-features.
            modifiable: Modifiability flag, forwarded to the base class and
                sub-features.

        Raises:
            NotImplementedError: Always; the feature is not yet tested.
        """
        # Deliberate gate: everything below is unreachable until the feature
        # is tested and this raise is removed.
        raise NotImplementedError("Mixed Feature is not yet tested.")
        super().__init__(training_vals, name, monotone, modifiable)

        categ_mask = np.isin(training_vals, categ_value_names)
        self.__categ_value_names = categ_value_names
        if map_to is None:
            map_to = -np.arange(len(categ_value_names)) - 1
        self.__categ_feat = Categorical(
            training_vals[categ_mask],
            categ_value_names,
            map_to,
            name,
            monotone,
            modifiable,
        )
        self.__cont_feat = Contiguous(
            training_vals[~categ_mask], name, monotone, modifiable
        )
        self.__default_val = default_val

        self._MAD = np.concatenate([self.__cont_feat.MAD, self.__categ_feat.MAD])

    # TODO, optionally make them separate into 2 columns for not-ohe
    # TODO make that into a configurable default
    # TODO, optionally give the range of applicable values (also contiguous)
    # TODO somehow makes sure that the contiguous part is >= 0
    @Feature._check_dims_on_encode
    def encode(
        self, vals: OneDimData, normalize: bool = True, one_hot: bool = True
    ) -> np.ndarray[np.float64]:
        """Encode mixed values into a float matrix.

        Column 0 holds the contiguous encoding of the non-categorical rows.
        With ``one_hot`` the categorical rows get ``default_val`` in column 0
        and their categorical encoding in the remaining columns; without it
        the categorical encoding is written into column 0 as well.

        Args:
            vals: Values to encode.
            normalize: Forwarded to both sub-feature encoders.
            one_hot: Whether categorical values get their own columns.

        Returns:
            Array built with shape ``(len(vals), width)`` where ``width`` is
            ``1 + n_categorical_vals`` for one-hot and ``1`` otherwise
            (before any handling done by ``_check_dims_on_encode``).
        """
        dimension = (1 + self.__categ_feat.n_categorical_vals) if one_hot else 1
        res = np.zeros(
            (vals.shape[0], dimension),
            dtype=np.float64,
        )
        categ_mask = np.isin(vals, self.__categ_value_names)
        res[~categ_mask, 0] = self.__cont_feat.encode(
            vals[~categ_mask], normalize, one_hot
        )
        if one_hot:
            res[categ_mask, 0] = self.__default_val
            res[categ_mask, 1:] = self.__categ_feat.encode(
                vals[categ_mask], normalize, one_hot
            )
        else:
            res[categ_mask, 0] = self.__categ_feat.encode(
                vals[categ_mask], normalize, one_hot
            )

        # `res` is allocated as float64 above, so no extra cast/copy is needed.
        return res

    def decode(
        self,
        vals: np.ndarray[np.float64],
        denormalize: bool = True,
        return_series: bool = True,
        discretize: bool = False,
    ) -> OneDimData:
        """Decode an encoded array back to the original mixed values.

        Input with more than one column is treated as one-hot encoded: a row
        is categorical when any column after the first is non-zero. Otherwise
        a value is categorical when it equals one of the numeric codes in the
        categorical value mapping.

        Args:
            vals: Encoded values; 1-D, a single column, or one-hot columns.
            denormalize: Forwarded to both sub-feature decoders.
            return_series: Wrap the result in a ``pd.Series`` named after
                this feature instead of returning the raw object array.
            discretize: Forwarded to both sub-feature decoders.

        Returns:
            The decoded values, as a ``pd.Series`` or an object ndarray.
        """
        is_one_hot = len(vals.shape) > 1 and vals.shape[1] > 1
        res = np.empty((vals.shape[0],), dtype=object)
        if is_one_hot:
            # A row is categorical if any of its one-hot columns is set.
            categ_mask = vals[:, 1:].astype(bool).any(axis=1)
            res[categ_mask] = self.__categ_feat.decode(
                vals[:, 1:], denormalize, return_series=False, discretize=discretize
            )[categ_mask]
            cont_scope = vals[:, 0]
        else:
            # A 2-D single-column input (the shape built by
            # encode(one_hot=False)) must be flattened, otherwise the 2-D
            # mask cannot index the 1-D result array.
            flat_vals = vals.reshape(-1) if len(vals.shape) > 1 else vals
            categ_mask = np.isin(
                flat_vals, list(self.__categ_feat.value_mapping.values())
            )
            res[categ_mask] = self.__categ_feat.decode(
                flat_vals[categ_mask],
                denormalize,
                return_series=False,
                discretize=discretize,
            )
            cont_scope = flat_vals

        res[~categ_mask] = self.__cont_feat.decode(
            cont_scope, denormalize, return_series=False, discretize=discretize
        )[~categ_mask]

        if return_series:
            return pd.Series(res, name=self.name)
        return res

    # TODO could be smarter, if this is an inner function, and these series
    # wrappers are written only once... Or remove the series part altogether
    def encoding_width(self, one_hot: bool) -> int:
        """Return the number of encoded columns.

        One contiguous column plus the categorical sub-feature's width when
        ``one_hot`` is set; a single column otherwise.
        """
        if one_hot:
            return 1 + self.__categ_feat.encoding_width(one_hot)
        return 1

    @property
    def default_val(self):
        """Default value stored at construction."""
        return self.__default_val

    @property
    def default_val_normalized(self):
        """Default value encoded by the contiguous sub-feature (normalized)."""
        return self.__cont_feat.encode(
            np.array([self.__default_val]), normalize=True, one_hot=False
        )[0]

    @property
    def bounds(self):
        """Bounds of the contiguous sub-feature."""
        return self.__cont_feat.bounds

    @property
    def value_mapping(self):
        """Value mapping of the categorical sub-feature."""
        return self.__categ_feat.value_mapping

    @property
    def n_categorical_vals(self):
        """Number of categorical values of the categorical sub-feature."""
        return self.__categ_feat.n_categorical_vals

    @property
    def orig_vals(self):
        """Original values of the categorical sub-feature."""
        return self.__categ_feat.orig_vals

    @property
    def numeric_vals(self):
        """Numeric values of the categorical sub-feature."""
        return self.__categ_feat.numeric_vals