Source code for humancompatible.detect.data_handler.features.Mixed

from __future__ import annotations

from typing import Optional

import numpy as np
import pandas as pd

from ..types import CategValue, OneDimData

from .Categorical import Categorical
from .Contiguous import Contiguous
from .Feature import Feature, Monotonicity



[docs]
class Mixed(Feature):
    """Feature whose values are partly continuous and partly categorical.

    Values listed in ``categ_value_names`` are delegated to an internal
    ``Categorical`` feature; every other value is delegated to an internal
    ``Contiguous`` feature.

    NOTE: the constructor currently raises ``NotImplementedError``
    unconditionally, so this class cannot be instantiated yet.
    """

    # TODO: do it via mixins?
    def __init__(
        self,
        training_vals: OneDimData,
        categ_value_names: list[CategValue],
        map_to: Optional[list[float]] = None,
        name: Optional[str] = None,
        # TODO add the bounds parameter
        default_val: float = 0,
        monotone: Monotonicity = Monotonicity.NONE,
        modifiable: bool = True,
    ):
        """Build the categorical and continuous sub-features.

        Args:
            training_vals: Training values of the feature; entries found in
                ``categ_value_names`` are treated as categorical, the rest
                as continuous.
            categ_value_names: The values to be treated as categorical.
            map_to: Numeric codes forwarded to the ``Categorical``
                sub-feature. Defaults to ``-1, -2, ..., -k`` for ``k``
                categorical values.
            name: Feature name, forwarded to the base class and to both
                sub-features.
            default_val: Value written into the continuous column of
                categorical rows by the one-hot encoding.
            monotone: Monotonicity setting, forwarded to the base class and
                to both sub-features.
            modifiable: Modifiability flag, forwarded to the base class and
                to both sub-features.

        Raises:
            NotImplementedError: Always; the feature is not yet tested.
        """
        raise NotImplementedError("Mixed Feature is not yet tested.")
        # NOTE: everything below is intentionally unreachable until the guard
        # above is removed; it is kept as the intended implementation.
        super().__init__(training_vals, name, monotone, modifiable)
        categ_mask = np.isin(training_vals, categ_value_names)
        self.__categ_value_names = categ_value_names
        if map_to is None:
            map_to = -np.arange(len(categ_value_names)) - 1
        self.__categ_feat = Categorical(
            training_vals[categ_mask],
            categ_value_names,
            map_to,
            name,
            monotone,
            modifiable,
        )
        self.__cont_feat = Contiguous(
            training_vals[~categ_mask], name, monotone, modifiable
        )
        self.__default_val = default_val

        self._MAD = np.concatenate([self.__cont_feat.MAD, self.__categ_feat.MAD])
        # TODO, optionally make them separate into 2 columns for not-ohe
        # TODO make that into a configurable default
        # TODO, optionally give the range of applicable values (also contiguous)
        # TODO somehow makes sure that the contiguous part is >= 0

    @Feature._check_dims_on_encode
    def encode(
        self, vals: OneDimData, normalize: bool = True, one_hot: bool = True
    ) -> np.ndarray[np.float64]:
        """Encode mixed values into a float matrix.

        Column 0 carries the encoded continuous values. With ``one_hot``
        the remaining columns carry the categorical encoding and column 0
        of categorical rows is set to ``default_val``; without ``one_hot``
        the categorical encoding is written into column 0 as well.

        Args:
            vals: Values to encode.
            normalize: Forwarded to both sub-feature encoders.
            one_hot: Whether the categorical part gets its own columns.

        Returns:
            A ``float64`` array with ``vals.shape[0]`` rows and
            ``1 + n_categorical_vals`` columns if ``one_hot``, else one
            column (before any processing done by the decorator).
        """
        dimension = (1 + self.__categ_feat.n_categorical_vals) if one_hot else 1
        res = np.zeros(
            (vals.shape[0], dimension),
            dtype=np.float64,
        )

        categ_mask = np.isin(vals, self.__categ_value_names)
        res[~categ_mask, 0] = self.__cont_feat.encode(
            vals[~categ_mask], normalize, one_hot
        )
        if one_hot:
            res[categ_mask, 0] = self.__default_val
            res[categ_mask, 1:] = self.__categ_feat.encode(
                vals[categ_mask], normalize, one_hot
            )
        else:
            res[categ_mask, 0] = self.__categ_feat.encode(
                vals[categ_mask], normalize, one_hot
            )
        # res was allocated as float64, so no further cast is needed.
        return res

    def decode(
        self,
        vals: np.ndarray[np.float64],
        denormalize: bool = True,
        return_series: bool = True,
        discretize: bool = False,
    ) -> OneDimData:
        """Decode an encoded array back to the original mixed values.

        Input with more than one column is treated as one-hot encoded:
        column 0 is the continuous part and a row is categorical when any
        of its remaining columns is non-zero. Otherwise (1-D input, or 2-D
        input with a single column) a value is categorical when it equals
        one of the codes in the categorical sub-feature's value mapping.

        Args:
            vals: Encoded values.
            denormalize: Forwarded to both sub-feature decoders.
            return_series: If true, wrap the result in a ``pd.Series``
                named after this feature.
            discretize: Forwarded to both sub-feature decoders.

        Returns:
            The decoded values as an object array, or as a ``pd.Series``
            when ``return_series`` is set.
        """
        is_one_hot = len(vals.shape) > 1 and vals.shape[1] > 1

        res = np.empty((vals.shape[0],), dtype=object)
        if is_one_hot:
            categ_part = vals[:, 1:]
            # A row is categorical if any of its categorical columns is set.
            categ_mask = categ_part.astype(bool).any(axis=1)
            res[categ_mask] = self.__categ_feat.decode(
                categ_part, denormalize, return_series=False, discretize=discretize
            )[categ_mask]
            cont_scope = vals[:, 0]
        else:
            # A single-column 2-D input (the shape produced by a non-one-hot
            # encode) would yield a 2-D mask that cannot index the 1-D result,
            # so flatten it first. 1-D input is used unchanged.
            flat_vals = vals.reshape(-1) if len(vals.shape) > 1 else vals
            categ_mask = np.isin(
                flat_vals, list(self.__categ_feat.value_mapping.values())
            )
            res[categ_mask] = self.__categ_feat.decode(
                flat_vals[categ_mask],
                denormalize,
                return_series=False,
                discretize=discretize,
            )
            cont_scope = flat_vals
        res[~categ_mask] = self.__cont_feat.decode(
            cont_scope, denormalize, return_series=False, discretize=discretize
        )[~categ_mask]

        # TODO could be smarter, if this is an inner function, and these series wrappers are written only once... Or remove the series part altogether
        if return_series:
            return pd.Series(res, name=self.name)
        return res

    def encoding_width(self, one_hot: bool) -> int:
        """Return the number of columns of the encoding.

        One column for the continuous part plus, when ``one_hot`` is set,
        the encoding width reported by the categorical sub-feature.
        """
        if one_hot:
            return 1 + self.__categ_feat.encoding_width(one_hot)
        return 1

    @property
    def default_val(self):
        """The ``default_val`` passed to the constructor."""
        return self.__default_val

    @property
    def default_val_normalized(self):
        """``default_val`` encoded (normalized) by the continuous sub-feature."""
        return self.__cont_feat.encode(
            np.array([self.__default_val]), normalize=True, one_hot=False
        )[0]

    @property
    def bounds(self):
        """``bounds`` of the continuous sub-feature."""
        return self.__cont_feat.bounds

    @property
    def value_mapping(self):
        """``value_mapping`` of the categorical sub-feature."""
        return self.__categ_feat.value_mapping

    @property
    def n_categorical_vals(self):
        """``n_categorical_vals`` of the categorical sub-feature."""
        return self.__categ_feat.n_categorical_vals

    @property
    def orig_vals(self):
        """``orig_vals`` of the categorical sub-feature."""
        return self.__categ_feat.orig_vals

    @property
    def numeric_vals(self):
        """``numeric_vals`` of the categorical sub-feature."""
        return self.__categ_feat.numeric_vals