Source code for sklearn_nominal.sklearn.nominal_model

import abc

import numpy as np
import pandas as pd
import scipy
from pandas import DataFrame
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.calibration import LabelEncoder
from sklearn.exceptions import NotFittedError
from sklearn.metrics import accuracy_score
from sklearn.utils import compute_class_weight, validation
from sklearn.utils._tags import (
    ClassifierTags,  # ty:ignore[unresolved-import]
    RegressorTags,  # ty:ignore[unresolved-import]
    Tags,  # ty:ignore[unresolved-import]
    TargetTags,  # ty:ignore[unresolved-import]
    TransformerTags,  # ty:ignore[unresolved-import]
    get_tags,  # ty:ignore[unresolved-import]
)
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import _check_y, validate_data  # ty:ignore[unresolved-import]

from .. import shared, tree
from ..backend import Input, Output
from ..backend.core import Dataset, Model
from ..backend.factory import make_dataset
from ..backend.pandas import PandasDataset
from ..shared.target_error import TargetError


def atleast_2d(x):
    """Coerce ``x`` to an array that has two or more dimensions.

    Scalars become a ``(1, 1)`` array and 1-D inputs of length ``n`` become a
    column of shape ``(n, 1)``. Inputs that already have two or more
    dimensions are passed through without reshaping.

    Args:
        x (array_like): Input data.

    Returns:
        ndarray: ``x`` converted with ``np.asanyarray`` and, when needed,
        reshaped so that ``ndim >= 2``.
    """
    arr = np.asanyarray(x)  # ty:ignore[unresolved-attribute]
    if arr.ndim >= 2:
        return arr
    if arr.ndim == 1:
        # A 1-D vector is treated as a single column, not a single row.
        return arr[:, np.newaxis]
    return arr.reshape(1, 1)


# This is a mixin that must be used with sklearn estimators

[docs]
class NominalModel(metaclass=abc.ABCMeta):
    """Mixin class for all nominal models in sklearn_nominal.

    This mixin provides the foundational infrastructure for models that natively
    handle nominal (categorical) attributes. It abstracts the complexities of
    managing different computation backends and provides a bridge between the
    scikit-learn API and the library's internal core logic.

    Architectural Context
    ---------------------
    `NominalModel` serves as the primary interface for backend abstraction. It
    manages the `model_` attribute, which stores the internal representation of
    the fitted model (from `sklearn_nominal.backend.core.Model`). It also
    handles the preservation of data types (dtypes) which is crucial for
    maintaining the nominal nature of features throughout the pipeline.

    Examples
    --------
    >>> from sklearn_nominal.sklearn.nominal_model import NominalModel
    >>> from sklearn.base import BaseEstimator
    >>> class MyNominalModel(NominalModel, BaseEstimator):
    ...     def __init__(self, backend='pandas'):
    ...         super().__init__(backend=backend)
    ...     def fit(self, X, y):
    ...         # implementation here
    ...         return self

    Attributes
    ----------
    backend : str
        The backend to use for computations (e.g., "pandas").
    model_ : sklearn_nominal.backend.core.Model
        The underlying fitted model object from the backend.
    is_fitted_ : bool
        Indicates whether the model has been successfully fitted.
    dtypes_ : pd.Series or list
        The data types of the features as observed during `fit`.
    """

    check_parameters = {"dtype": None}

    def __init__(self, backend: str = "pandas", *args, **kwargs):
        """Initializes the nominal model.

        Args:
            backend (str): The backend to use for computations. Defaults to "pandas".
            *args: Additional positional arguments for the parent class.
            **kwargs: Additional keyword arguments for the parent class.
        """
        super().__init__(*args, **kwargs)
        self.backend = backend


[docs]
    def complexity(self):
        """Returns the complexity of the fitted model.

        The definition of complexity is backend and model dependent. For trees,
        it typically represents the number of nodes.

        Returns
        -------
        int or float
            The complexity metric of the model.

        Raises
        ------
        NotFittedError
            If the model has not been fitted yet.
        """
        self.check_is_fitted()
        return self.model_.complexity()



[docs]
    def set_sklearn_tags(self, tags):
        """Sets scikit-learn tags for the nominal model.

        Configures the estimator tags to accurately reflect its capabilities,
        specifically its ability to handle string inputs and missing values
        natively.

        Parameters
        ----------
        tags : Tags
            The scikit-learn tags object to be modified in-place.
        """
        tags.non_deterministic = False
        tags.input_tags.sparse = False
        tags.input_tags.allow_nan = True
        tags.input_tags.string = True



[docs]
    def pretty_print(self, class_names: list[str] | None = None):
        """Returns a string representation of the fitted model.

        Delegates the visualization logic to the underlying backend model.

        Parameters
        ----------
        class_names : list of str, optional
            Names of the classes to use in the output. If None, default
            identifiers are used.

        Returns
        -------
        str
            A human-readable representation of the model.

        Raises
        ------
        NotFittedError
            If the model has not been fitted yet.
        """
        self.check_is_fitted()
        return self.model_.pretty_print(class_names=class_names)



[docs]
    def check_is_fitted(self):
        """Checks if the model has been fitted.

        Raises
        ------
        NotFittedError
            If the `is_fitted_` attribute is not set or is False.
        """
        if not hasattr(self, "is_fitted_") or not self.is_fitted_:
            raise NotFittedError()



[docs]
    def get_dtypes(self, x):
        """Extracts and maps data types from the input.

        This method identifies the data types of the input features to ensure
        they are correctly handled by the backend.

        Parameters
        ----------
        x : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input data.

        Returns
        -------
        dict or None
            A dictionary mapping column names to data types if `x` is a
            DataFrame, otherwise None.
        """
        if isinstance(x, pd.DataFrame):
            dtypes = x.dtypes.to_dict()
        else:
            dtypes = None
        return dtypes



[docs]
    def get_feature_names(self):
        """Returns the names of the features seen during fit.

        Returns
        -------
        ndarray of str or None
            The feature names, or None if they were not available during fit
            (e.g., if input was a numpy array).
        """
        if not hasattr(self, "feature_names_in_"):
            return None
        else:
            return self.feature_names_in_



[docs]
    def set_dtypes(self, x):
        """Sets and persists the data types based on the input.

        This is called during `fit` to ensure that subsequent calls to
        `predict` can cast the input data to the same types, preserving
        nominal/numeric distinctions.

        Parameters
        ----------
        x : {pd.DataFrame, np.ndarray, sparse matrix}
            The input data to extract types from.

        Raises
        ------
        ValueError
            If the input type is not supported or if the input is not 2D.
        """
        if isinstance(x, pd.DataFrame):
            self.dtypes_ = x.dtypes
        elif isinstance(x, np.ndarray) or scipy.sparse.issparse(x):
            if len(x.shape) != 2:
                raise ValueError(f"Expected 2d input, actual shape {x.shape}")
            self.dtypes_ = [x.dtype] * x.shape[1]
        else:
            raise ValueError(f"Only pd.Dataframe or np.ndarray supported, received: {x}")



[docs]
    def set_model(self, model):
        """Sets the underlying backend model and marks it as fitted.

        Parameters
        ----------
        model : sklearn_nominal.backend.core.Model
            The trained model instance from the backend.
        """
        self.model_: Model = model
        self.is_fitted_ = True




class NominalUnsupervisedModel(NominalModel):
    """Base class for unsupervised nominal models.

    Builds on `NominalModel` and adjusts the target-related tags so that a
    target is not required.
    """

    def set_sklearn_tags(self, tags):
        """Apply the base tags, then mark the target as optional.

        Parameters
        ----------
        tags : Tags
            The scikit-learn tags object to be modified in-place.

        Returns
        -------
        Tags
            The same tags object that was passed in.
        """
        super().set_sklearn_tags(tags)
        target_tags = tags.target_tags
        target_tags.single_output = False
        target_tags.required = False
        return tags


class NominalSupervisedModel(NominalModel):
    """Base class for supervised nominal models.

    Extends `NominalModel` to provide shared validation logic for supervised
    learning tasks.
    """

    def set_sklearn_tags(self, tags):
        """Sets scikit-learn tags for the supervised nominal model.

        Applies the base tags, then marks the target as required and not
        restricted to a single output.

        Parameters
        ----------
        tags : Tags
            The scikit-learn tags object to be modified in-place.

        Returns
        -------
        Tags
            The same tags object, for consistency with
            `NominalUnsupervisedModel.set_sklearn_tags`.
        """
        super().set_sklearn_tags(tags)
        tags.target_tags.single_output = False
        tags.target_tags.required = True
        return tags

    def validate_data_predict(self, x):
        """Validates and prepares input data for prediction.

        The input is run through scikit-learn's `validate_data` with
        ``reset=False`` and then wrapped in a DataFrame whose columns are the
        stored feature names.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            The input data to validate.

        Returns
        -------
        pd.DataFrame
            The validated data. When ``x`` was passed as a DataFrame, its own
            column dtypes (as seen in this call, not the ones seen during
            `fit`) are re-applied to the result; otherwise the dtype produced
            by validation is kept.

        Raises
        ------
        NotFittedError
            If the model has not been fitted yet.
        ValueError
            If the input contains no samples. `validate_data` may raise it
            for other validation failures as well.
        """
        # Capture the dtypes before validation replaces ``x``; the validated
        # value is rebuilt into a DataFrame below and would otherwise lose
        # the caller's column dtypes.
        dtypes = self.get_dtypes(x)
        self.check_is_fitted()
        x = validate_data(
            self,
            x,
            reset=False,
            dtype=None,
            ensure_all_finite=False,
            accept_sparse=False,
        )
        n = len(x)
        if n == 0:
            raise ValueError("Input contains 0 samples.")
        df = pd.DataFrame(x, columns=self.get_feature_names())
        if dtypes is not None:
            df = df.astype(dtypes)
        return df



[docs]
class NominalClassifier(NominalSupervisedModel, ClassifierMixin):
    """Base class for nominal classifiers.

    This class coordinates the end-to-end classification workflow, including
    target encoding, class weight computation, and delegation to backend
    trainers.

    Architectural Context
    ---------------------
    `NominalClassifier` implements the standard scikit-learn `fit`/`predict`
    cycle while delegating the actual training logic to an internal "trainer"
    (created via `make_model`). It handles the transformation of target
    labels using `LabelEncoder` to ensure the backend receives integer-encoded
    classes, while providing inverse transformations for user-facing output.

    Examples
    --------
    >>> from sklearn_nominal.sklearn.nominal_model import NominalClassifier
    >>> class MyClassifier(NominalClassifier):
    ...     def make_model(self, d, class_weight):
    ...         # Return a backend-specific trainer
    ...         pass

    Attributes
    ----------
    class_weight : dict, list of dicts or "balanced", default=None
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one.
    classes_ : ndarray of shape (n_classes,)
        The unique class labels observed during `fit`.
    le_ : sklearn.preprocessing.LabelEncoder
        The encoder used to map class labels to internal integer indices.
    """

    def __init__(self, class_weight=None, *args, **kwargs):
        """Initialise the parent classes, then record ``class_weight``.

        Parameters
        ----------
        class_weight : dict, list of dicts or "balanced", optional
            Class weighting specification; stored unchanged as
            ``self.class_weight``. Defaults to None.
        *args
            Positional arguments forwarded to ``super().__init__``.
        **kwargs
            Keyword arguments forwarded to ``super().__init__``.
        """
        super().__init__(*args, **kwargs)
        self.class_weight = class_weight

    def __sklearn_tags__(self):
        """Build the scikit-learn tags for this classifier.

        Starts from the tags provided by the parent classes, applies
        ``set_sklearn_tags`` and finally flags multi-class support.

        Returns
        -------
        Tags
            The configured tags object.
        """
        tags = super().__sklearn_tags__()  # ty:ignore[unresolved-attribute]
        self.set_sklearn_tags(tags)
        classifier_tags = tags.classifier_tags
        classifier_tags.multi_class = True
        return tags


[docs]
    def validate_data_fit_classification(self, x, y) -> tuple[Dataset, np.ndarray]:
        """Validate the training data and package it for the backend.

        Steps, in execution order:

        1. Check that `y` is a valid classification target.
        2. Record the column dtypes of `x` (DataFrame input only).
        3. Validate `x` and `y` with scikit-learn's `validate_data`.
        4. Store the unique labels in `classes_` and reject targets with
           fewer than two classes.
        5. Compute the class weights via `get_class_weights`.
        6. Encode `y` via `get_y` and build the backend `Dataset` with
           `make_dataset`.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            The input features.
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target labels.

        Returns
        -------
        tuple
            A tuple containing:
            - Dataset : The dataset object built by `make_dataset`.
            - np.ndarray : The class weights returned by `get_class_weights`.

        Raises
        ------
        ValueError
            If `y` contains fewer than two unique classes.
        """
        check_classification_targets(y)
        # Must be read before validation rebinds ``x``.
        original_dtypes = self.get_dtypes(x)
        validation_options = dict(
            reset=True,
            multi_output=True,
            y_numeric=False,
            ensure_all_finite=False,
            dtype=None,
            accept_sparse=False,
        )
        x, y = validate_data(self, x, y, **validation_options)

        labels = np.unique(y)
        self.classes_ = labels
        if len(labels) < 2:
            raise ValueError("Can't train classifier with one class.")

        weights = self.get_class_weights(y)
        encoded_targets = self.get_y(y)
        dataset = make_dataset(
            self.backend,
            x,
            encoded_targets,
            self.get_feature_names(),
            original_dtypes,
        )
        return dataset, weights



[docs]
    def get_y(self, y):
        """Check the target and encode its labels as integers.

        A fresh `LabelEncoder` is created on every call and stored as
        ``self.le_`` before it is fitted on the checked target.

        Parameters
        ----------
        y : array-like
            The target labels to encode.

        Returns
        -------
        ndarray
            The labels as encoded by ``LabelEncoder.fit_transform``.
        """
        checked = _check_y(y, multi_output=True, y_numeric=False, estimator=self)
        # TODO make pure numpy
        encoder = LabelEncoder()
        self.le_ = encoder
        return encoder.fit_transform(checked)



[docs]
    @abc.abstractmethod
    def make_model(self, d: Dataset, class_weight: np.ndarray):
        """Create the trainer used by `fit` (abstract).

        Implementation Guidance
        -----------------------
        `fit` calls ``make_model(d, class_weight)`` and then invokes
        ``.fit(d)`` on the returned object, storing the result as the fitted
        backend model. Subclasses must therefore return an object exposing a
        ``fit(dataset)`` method that returns the trained model.

        Parameters
        ----------
        d : Dataset
            The training dataset prepared by `validate_data_fit_classification`.
        class_weight : np.ndarray
            The class weights computed during validation.

        Returns
        -------
        trainer : object
            A trainer instance capable of fitting the provided dataset.
        """



[docs]
    def fit(self, x: Input, y: Output):
        """Train the classifier on ``x`` and ``y``.

        Validates the data, asks `make_model` for a trainer, fits the trainer
        on the resulting dataset and stores the trained model via
        `set_model`.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            The training input samples.
        y : array-like of shape (n_samples,)
            The target values (class labels).

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        dataset, weights = self.validate_data_fit_classification(x, y)
        fitted_model = self.make_model(dataset, weights).fit(dataset)
        self.set_model(fitted_model)
        return self



[docs]
    def get_class_weights(self, y):
        """Compute per-class weights for the given target.

        Delegates to scikit-learn's `compute_class_weight`, using the
        estimator's ``class_weight`` setting and the labels stored in
        ``self.classes_`` (which must already be set).

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            The target labels.

        Returns
        -------
        np.ndarray
            The computed weights for each class, aligned with `self.classes_`.
        """
        weighting = self.class_weight
        known_classes = self.classes_
        return compute_class_weight(class_weight=weighting, classes=known_classes, y=y)



[docs]
    def build_error(self, criterion: str, class_weight: np.ndarray) -> TargetError:
        """Create the classification error function named by ``criterion``.

        The number of classes is taken from ``len(class_weight)``. Both
        "entropy" and "gain_ratio" map to an `EntropyError` instance, while
        "gini" maps to a `GiniError` instance.

        Parameters
        ----------
        criterion : str
            One of "entropy", "gini" or "gain_ratio".
        class_weight : np.ndarray
            The class weights handed to the error function.

        Returns
        -------
        TargetError
            The error function object for the requested criterion.

        Raises
        ------
        ValueError
            If the criterion is not recognized.
        """
        n_classes = len(class_weight)
        available = {
            "entropy": shared.EntropyError(n_classes, class_weight),
            "gini": shared.GiniError(n_classes, class_weight),
            "gain_ratio": shared.EntropyError(n_classes, class_weight),
        }
        if criterion in available:
            return available[criterion]
        raise ValueError(f"Unknown error function {criterion}")



[docs]
    def predict_proba(self, x: Input) -> Output:
        """Predict class probabilities for the given samples.

        After confirming the estimator is fitted, the input is validated
        with `validate_data_predict` and handed to the backend model's
        ``predict`` method, whose output is returned unchanged.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            The input samples to predict.

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute `classes_`.
        """
        self.check_is_fitted()
        features = self.validate_data_predict(x)
        return self.model_.predict(features)



[docs]
    def predict(self, x: Input) -> Output:
        """Predict a class label for each input sample.

        Picks, per row of `predict_proba`'s output, the column with the
        highest value and maps that index back to the original label with
        the fitted `LabelEncoder`.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            The input samples to predict.

        Returns
        -------
        ndarray of shape (n_samples,)
            The predicted class labels.
        """
        probabilities = self.predict_proba(x)
        winning_index = probabilities.argmax(axis=1)
        return self.le_.inverse_transform(winning_index)





[docs]
class NominalRegressor(NominalSupervisedModel, RegressorMixin):
    """Base class for nominal regressors.

    This class coordinates the regression workflow for models that handle
    nominal features natively.

    Architectural Context
    ---------------------
    `NominalRegressor` mirrors the structure of `NominalClassifier` but
    specializes in continuous target variables. It manages the conversion
    of targets to at least 2D arrays to ensure consistent handling by the
    backend and provides error function building logic tailored for
    regression (e.g., standard deviation reduction).

    Examples
    --------
    >>> from sklearn_nominal.sklearn.nominal_model import NominalRegressor
    >>> class MyRegressor(NominalRegressor):
    ...     def make_model(self, d):
    ...         # Return a backend-specific trainer
    ...         pass
    """

    def __sklearn_tags__(self):
        """Build the scikit-learn tags for this regressor.

        Starts from the tags provided by the parent classes and applies
        ``set_sklearn_tags`` to them.

        Returns
        -------
        Tags
            The configured tags object.
        """
        regressor_tags = super().__sklearn_tags__()  # ty:ignore[unresolved-attribute]
        self.set_sklearn_tags(regressor_tags)
        return regressor_tags


[docs]
    def validate_data_fit_regression(self, x, y) -> Dataset:
        """Validate the training data and package it for the backend.

        Records the column dtypes of `x` (DataFrame input only), validates
        `x` and `y` with scikit-learn, remembers the validated target's shape
        in ``self._y_original_shape`` and promotes the target to at least 2D
        before building the dataset.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            The input features.
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values.

        Returns
        -------
        Dataset
            The dataset object built by `make_dataset`.
        """
        # Must be read before validation rebinds ``x``.
        original_dtypes = self.get_dtypes(x)
        validation_options = dict(
            reset=True,
            multi_output=True,
            y_numeric=True,
            ensure_all_finite=False,
            dtype=None,
            accept_sparse=False,
        )
        x, y = validate_data(self, x, y, **validation_options)
        y = _check_y(y, multi_output=True, y_numeric=True, estimator=self)
        # `predict` uses this to decide whether to flatten its output.
        self._y_original_shape = y.shape
        target_2d = atleast_2d(y)
        return make_dataset(
            self.backend,
            x,
            target_2d,
            self.get_feature_names(),
            original_dtypes,
        )



[docs]
    def build_error(self, criterion: str):
        """Create the regression error function named by ``criterion``.

        Parameters
        ----------
        criterion : str
            The error criterion to use. Only "std" is available, which maps
            to a `DeviationError` instance.

        Returns
        -------
        TargetError
            The error function object for the requested criterion.

        Raises
        ------
        ValueError
            If the criterion is not recognized.
        """
        available = {
            "std": shared.DeviationError(),
        }
        if criterion in available:
            return available[criterion]
        raise ValueError(f"Unknown error function {criterion}")



[docs]
    def predict(self, x: Input):
        """Predicts target values for input samples.

        The input is validated with `validate_data_predict` and passed to the
        backend model. When the target seen during `fit` was 1-D, length-1
        axes are removed from the prediction while the sample axis is always
        kept, so a single-sample prediction has shape ``(1,)``.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        ndarray of shape (n_samples,) or (n_samples, n_outputs)
            The predicted target values.

        Raises
        ------
        NotFittedError
            If the model has not been fitted yet.
        """
        self.check_is_fitted()
        x = self.validate_data_predict(x)
        y = self.model_.predict(x)
        if len(self._y_original_shape) == 1:
            y = y.squeeze()
            if np.ndim(y) == 0:
                # squeeze() removes *every* length-1 axis, so predicting a
                # single sample collapses to a 0-d value; restore the sample
                # axis so the result keeps shape (n_samples,).
                y = np.reshape(y, (1,))
        return y



[docs]
    @abc.abstractmethod
    def make_model(self, d: Dataset):
        """Create the trainer used by `fit` (abstract).

        Implementation Guidance
        -----------------------
        `fit` calls ``make_model(d)`` and then invokes ``.fit(d)`` on the
        returned object, storing the result as the fitted backend model.
        Subclasses must therefore return an object exposing a
        ``fit(dataset)`` method that returns the trained model.

        Parameters
        ----------
        d : Dataset
            The training dataset prepared by `validate_data_fit_regression`.

        Returns
        -------
        trainer : object
            A trainer instance capable of fitting the provided dataset.
        """



[docs]
    def fit(self, x: Input, y: Output):
        """Train the regressor on ``x`` and ``y``.

        Validates the data, asks `make_model` for a trainer, fits the trainer
        on the resulting dataset and stores the trained model via
        `set_model`.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            The training input samples.
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            The target values.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        dataset = self.validate_data_fit_regression(x, y)
        fitted_model = self.make_model(dataset).fit(dataset)
        self.set_model(fitted_model)
        return self