# Source code for sklearn_nominal.sklearn.naive_bayes

import numpy as np
from sklearn.base import BaseEstimator

from sklearn_nominal.backend.core import Dataset
from sklearn_nominal.backend.factory import DEFAULT_BACKEND
from sklearn_nominal.bayes.model import NaiveBayes
from sklearn_nominal.bayes.trainer import NaiveBayesTrainer
from sklearn_nominal.sklearn.nominal_model import NominalClassifier



class NaiveBayesClassifier(NominalClassifier, BaseEstimator):
    """A Naive Bayes classifier supporting nominal attributes.

    A NaiveBayesClassifier that mimics `scikit-learn`'s
    :class:`sklearn.naive_bayes.GaussianNB` but adds support for nominal
    attributes with categorical distributions.

    Args:
        smoothing (float, optional): The Laplace smoothing factor for categorical
            distributions. This value will be added to the count of each value to
            generate a smoothed categorical distribution. The default value, 0.0,
            indicates no smoothing.
        backend (str, optional): The backend to use for computations. Defaults to "pandas".
        class_weight (dict or "balanced", optional): Weights associated with classes
            in the form ``{class_label: weight}``. If None, all classes are assumed
            to have weight one. The "balanced" mode uses the values of y to
            automatically adjust weights inversely proportional to class frequencies.
            Defaults to None.

    Attributes:
        classes_ (ndarray of shape (n_classes,)): The classes labels.
        n_classes_ (int): The number of classes.
        n_features_in_ (int): Number of features seen during :term:`fit`.
        feature_names_in_ (ndarray of shape (n_features_in_,)): Names of features
            seen during :term:`fit`. Defined only when `X` has feature names that
            are all strings.
        n_outputs_ (int): The number of outputs when ``fit`` is performed.
        model_ (NaiveBayes): The underlying NaiveBayes that actually holds the
            distribution parameters and can perform inference.

    See Also:
        TreeClassifier: A decision tree classifier.

    Notes:
        The :meth:`predict` method operates using the :func:`numpy.argmax`
        function on the outputs of :meth:`predict_proba`. This means that in
        case the highest predicted probabilities are tied, the classifier will
        predict the tied class with the lowest index in :term:`classes_`.

    Examples:
        >>> from sklearn.datasets import fetch_openml
        >>> df = fetch_openml("credit-g", version=2).frame
        >>> x, y = df.iloc[:, 0:-1], df.iloc[:, -1]
        >>>
        >>> from sklearn_nominal import NaiveBayesClassifier
        >>> model = NaiveBayesClassifier(smoothing=0.01)
        >>> _ = model.fit(x, y)
        >>>
        >>> from sklearn.metrics import accuracy_score
        >>> y_pred = model.predict(x)
        >>> print(accuracy_score(y, y_pred))
        0.787
    """

    def __sklearn_tags__(self):
        """Returns the scikit-learn tags for the estimator.

        Extends the tags produced by the parent classes by setting
        ``classifier_tags.poor_score`` to ``True``.

        Returns:
            Tags: The scikit-learn tags.
        """
        tags = super().__sklearn_tags__()
        # Declare that this classifier may not reach the accuracy expected by
        # scikit-learn's generic estimator checks.
        tags.classifier_tags.poor_score = True
        return tags

    def __init__(self, smoothing=0.0, backend=DEFAULT_BACKEND, class_weight=None):
        """Initializes the NaiveBayesClassifier.

        Args:
            smoothing (float): The Laplace smoothing factor for categorical
                distributions. This value will be added to the count of each value to
                generate a smoothed categorical distribution. The default value, 0.0,
                indicates no smoothing.
            backend (str): The backend to use for computations. Defaults to "pandas".
            class_weight (dict or "balanced", optional): Weights associated with classes
                in the form ``{class_label: weight}``. If None, all classes are assumed
                to have weight one. The "balanced" mode uses the values of y to
                automatically adjust weights inversely proportional to class frequencies.
                Defaults to None.
        """
        super().__init__(backend=backend, class_weight=class_weight)
        # Stored unmodified under the parameter's own name, as scikit-learn
        # requires for ``get_params`` / ``clone`` to work.
        self.smoothing = smoothing

    def make_model(self, d: Dataset, class_weight: np.ndarray) -> NaiveBayesTrainer:
        """Creates the NaiveBayesTrainer for the model.

        Args:
            d (Dataset): The dataset to train on. It is part of the expected
                signature but is not used to configure the trainer here.
            class_weight (np.ndarray): The weights for each class.

        Returns:
            NaiveBayesTrainer: The trainer instance for Naive Bayes, configured
            with ``class_weight`` and this estimator's ``smoothing``.
        """
        return NaiveBayesTrainer(class_weight, smoothing=self.smoothing)

    def fit(self, x, y):
        """Fit the Naive Bayes model according to the given training data.

        This algorithm calculates the prior probabilities of each class and
        the conditional probabilities of each feature given the class. For
        nominal features, categorical distributions are estimated (with
        Laplace smoothing if requested). For numeric features, Gaussian
        distributions are typically assumed.

        Args:
            x (pd.DataFrame or np.ndarray): The training input samples.
            y (np.ndarray): The target values (class labels) as integers or strings.

        Returns:
            self: Returns the instance itself.
        """
        # Thin override: the work is done by the parent class; this method
        # exists to carry estimator-specific documentation.
        return super().fit(x, y)

    def predict(self, x):
        """Perform classification on an array of test vectors X.

        Predictions are made using Bayes' Theorem by multiplying the prior
        class probability with the conditional probabilities of all feature
        values given the class, and selecting the class with the highest
        posterior probability. Ties are resolved by choosing the class with the
        lowest index in :term:`classes_`.

        Args:
            x (pd.DataFrame or np.ndarray): The input samples.

        Returns:
            np.ndarray: Predicted target values for X.
        """
        return super().predict(x)

    def predict_proba(self, x):
        """Return probability estimates for the test data X.

        Probabilities are calculated by normalizing the posterior
        probabilities obtained via Bayes' Theorem for each class across all
        samples.

        Args:
            x (pd.DataFrame or np.ndarray): The input samples.

        Returns:
            np.ndarray: Returns the probability of the sample for each class
                in the model.
        """
        return super().predict_proba(x)

    def plot_distributions(
        self,
        class_names: list[str] | None = None,
        feature_names: list[str] | None = None,
        n_cols: int = 4,
    ) -> None:
        """Plot the distributions held by the fitted model.

        Verifies that the estimator has been fitted and then delegates to
        ``plot_distributions`` of the underlying ``model_``.

        Args:
            class_names (list[str], optional): Forwarded unchanged to the
                underlying model; presumably the labels used for each class.
            feature_names (list[str], optional): Forwarded unchanged to the
                underlying model; presumably the labels used for each feature.
            n_cols (int): Forwarded unchanged to the underlying model;
                presumably the number of columns in the plot grid. Defaults to 4.

        Returns:
            None: Whatever the underlying model returns is discarded.
        """
        self.check_is_fitted()
        model: NaiveBayes = self.model_
        model.plot_distributions(class_names, feature_names, n_cols)

    def explain(self, x, class_names: list[str] | None = None):
        """Explain the model's output for ``x``.

        Verifies that the estimator has been fitted and then delegates to
        ``explain`` of the underlying ``model_``.

        Args:
            x: The input passed unchanged to the underlying model.
            class_names (list[str], optional): Forwarded unchanged to the
                underlying model; presumably the labels used for each class.

        Returns:
            The value returned by ``model_.explain(x, class_names)``.
        """
        self.check_is_fitted()
        return self.model_.explain(x, class_names)