# Source code for sklearn_nominal.sklearn.rule_cn2

import sys

import numpy as np
import pandas as pd
from scipy.odr import Output
from sklearn.base import BaseEstimator
from sklearn.utils import compute_class_weight

from sklearn_nominal.backend import Input
from sklearn_nominal.backend.core import Dataset
from sklearn_nominal.backend.factory import DEFAULT_BACKEND
from sklearn_nominal.rules.cn2 import CN2
from sklearn_nominal.rules.oner import OneR
from sklearn_nominal.rules.prism import PRISM
from sklearn_nominal.sklearn.nominal_model import NominalClassifier, NominalRegressor

# Very small positive constant. It is not referenced by the estimators
# defined in this part of the module; presumably intended as a numerical
# tolerance -- TODO confirm before removing.
eps: float = 1e-16



[docs]
class CN2Classifier(NominalClassifier, BaseEstimator):
    """A rule-based classifier that performs sequential covering in a CN2 [1] style.

    [1] Clark, P. and Niblett, T (1989) The CN2 induction algorithm. Machine Learning 3(4):261-283.

    Args:
        criterion (str, optional): The function to measure the quality of a split.
            Supported criteria are "gini" for the Gini impurity and "log_loss"
            and "entropy" both for the Shannon information gain. Defaults to "entropy".
        max_rule_length (int, optional): The maximum number of conditions in a rule.
            Analogous to the maximum height of a Tree model. Defaults to sys.maxsize.
        max_rules (int, optional): Maximum number of rules for the model.
            Analogous to the maximum number of leaves in a Tree model. Defaults to sys.maxsize.
        min_rule_support (int, optional): Minimum number of samples that satisfy
            the condition of a rule required to include that rule in the model.
            Analogous to the `min_samples_leaf` parameter for Tree models.
            Defaults to 10.
        max_error_per_rule (float, optional): Maximum (absolute) error that the
            rule can have. This value depends on the error (criterion) used for the model.
            Defaults to 0.99.
        backend (str, optional): The backend to use for computations. Defaults to DEFAULT_BACKEND.
        class_weight (dict or "balanced", optional): Weights associated with classes
            in the form ``{class_label: weight}``. If None, all classes are assumed
            to have weight one. Defaults to None.

    Attributes:
        classes_ (ndarray of shape (n_classes,)): The classes labels.
        n_classes_ (int): The number of classes.
        n_features_in_ (int): Number of features seen during :term:`fit`.
        feature_names_in_ (ndarray of shape (n_features_in_,)): Names of features
            seen during :term:`fit`. Defined only when `X` has feature names that
            are all strings.
        n_outputs_ (int): The number of outputs when ``fit`` is performed.
        model_ (RuleModel): The underlying model object.

    See Also:
        TreeRegressor: A decision tree regressor with nominal support.
        NaiveBayesClassifier: A NaiveBayesClassifier with nominal support.
        ZeroRClassifier: A ZeroR classifier with nominal support.
        OneRClassifier: A OneR classifier with nominal support.
        PRISMClassifier: A PRISM classifier with nominal support.

    Examples:
        >>> from sklearn.datasets import fetch_openml
        >>> df = fetch_openml("credit-g", version=2).frame
        >>> x, y = df.iloc[:, 0:-1], df.iloc[:, -1]

        >>> from sklearn_nominal import CN2Classifier
        >>> model = CN2Classifier()
        >>> _ = model.fit(x, y)

        >>> from sklearn.metrics import accuracy_score
        >>> y_pred = model.predict(x)
        >>> print(accuracy_score(y, y_pred))
        0.787
    """

    def __init__(
        self,
        criterion: str = "entropy",
        max_rule_length: int = sys.maxsize,
        max_rules: int = sys.maxsize,
        min_rule_support: int = 10,
        max_error_per_rule: float = 0.99,
        backend=DEFAULT_BACKEND,
        class_weight: dict | str | None = None,
    ):
        """Initializes the CN2Classifier.

        The constructor only stores the hyperparameters; no validation or
        computation happens here.

        Args:
            criterion (str): The function to measure the quality of a split.
                Supported criteria are "gini" for the Gini impurity and "log_loss"
                and "entropy" both for the Shannon information gain.
                Defaults to "entropy".
            max_rule_length (int): The maximum number of conditions in a rule.
                Defaults to sys.maxsize.
            max_rules (int): Maximum number of rules for the model.
                Defaults to sys.maxsize.
            min_rule_support (int): Minimum number of samples that satisfy
                the condition of a rule required to include that rule in the model.
                Defaults to 10.
            max_error_per_rule (float): Maximum (absolute) error that the
                rule can have. Defaults to 0.99.
            backend (str): The backend to use for computations.
                Defaults to DEFAULT_BACKEND.
            class_weight (dict or "balanced", optional): Weights associated with classes
                in the form ``{class_label: weight}``. If None, all classes are assumed
                to have weight one. Defaults to None.
        """
        # backend and class_weight are handled by the NominalClassifier base.
        super().__init__(backend=backend, class_weight=class_weight)
        self.max_rule_length = max_rule_length
        self.max_rules = max_rules
        self.min_rule_support = min_rule_support
        self.max_error_per_rule = max_error_per_rule
        self.criterion = criterion

    def make_model(self, d: Dataset, class_weight: np.ndarray):
        """Creates the CN2 trainer for the model.

        Args:
            d (Dataset): The dataset to train on. Not used by this method;
                kept as part of the method signature.
            class_weight (np.ndarray): The weights for each class.

        Returns:
            CN2: The CN2 trainer instance.
        """
        # The error measure is derived from the configured criterion and the
        # per-class weights; the remaining hyperparameters are forwarded as-is.
        error = self.build_error(self.criterion, class_weight)
        return CN2(
            error,
            self.max_rule_length,
            self.max_rules,
            self.min_rule_support,
            self.max_error_per_rule,
        )

    def fit(self, x, y):
        """Fit the CN2 model according to the given training data.

        The CN2 algorithm induces a set of classification rules using a
        sequential covering (or "separate-and-conquer") process. It repeatedly
        identifies a rule that covers a subset of the training data and
        removes the covered samples until a sufficient number of rules is
        found or all samples are covered.

        Args:
            x (pd.DataFrame or np.ndarray): The training input samples.
            y (np.ndarray): The target values (class labels) as integers or strings.

        Returns:
            self: Returns the instance itself.
        """
        # Training itself is implemented by the parent class; this override
        # exists to carry the CN2-specific documentation.
        return super().fit(x, y)

    def predict(self, x):
        """Perform classification on an array of test vectors X.

        For each input sample, the algorithm evaluates the learned rules in
        order. The first rule that matches the sample's features determines
        the predicted class. If no rules match, the default class (based on
        the distribution of uncovered training samples) is used.

        Args:
            x (pd.DataFrame or np.ndarray): The input samples.

        Returns:
            np.ndarray: Predicted target values for X.
        """
        return super().predict(x)

    def predict_proba(self, x):
        """Return probability estimates for the test data X.

        Probabilities are estimated from the class distribution of training
        samples covered by the first rule that matches the input sample. If no
        rules match, the probability distribution is based on the uncovered
        training samples (the default rule).

        Args:
            x (pd.DataFrame or np.ndarray): The input samples.

        Returns:
            np.ndarray: Returns the probability of the sample for each class
                in the model.
        """
        return super().predict_proba(x)





[docs]
class CN2Regressor(NominalRegressor, BaseEstimator):
    """Rule-based regressor built by CN2-style [1] sequential covering.

    [1] Clark, P. and Niblett, T (1989) The CN2 induction algorithm. Machine Learning 3(4):261-283.

    Args:
        criterion (str, optional): Error measure used to evaluate a split.
            Only "std" (standard deviation, equivalent to root MSE) is
            currently supported. Defaults to "std".
        max_rule_length (int, optional): Upper bound on the number of
            conditions in a single rule; plays the role of the maximum height
            of a Tree model. Defaults to sys.maxsize.
        max_rules (int, optional): Upper bound on the number of rules in the
            model; plays the role of the maximum number of leaves of a Tree
            model. Defaults to sys.maxsize.
        min_rule_support (int, optional): Fewest samples that must satisfy a
            rule's condition for the rule to be included in the model.
            Comparable to `min_samples_leaf` in Tree models. Defaults to 10.
        max_error_per_rule (float, optional): Largest (absolute) error a rule
            may have. Its scale depends on the chosen error (criterion).
            Defaults to 0.99.
        backend (str, optional): Computation backend. Defaults to DEFAULT_BACKEND.

    Attributes:
        n_features_in_ (int): Number of features seen during :term:`fit`.
        feature_names_in_ (ndarray of shape (n_features_in_,)): Names of the
            features seen during :term:`fit`. Only defined when `X` has
            feature names that are all strings.
        n_outputs_ (int): The number of outputs when ``fit`` is performed.
        model_ (RuleModel): The underlying model object.

    See Also:
        TreeRegressor: A decision tree regressor with nominal support.
        ZeroRRegressor: A ZeroR regressor with nominal support.
        OneRRegressor: A OneR regressor with nominal support.

    Examples:
        ``url`` below stands for the location of the golf regression dataset
        and must be defined by the caller.

        >>> from sklearn_nominal import CN2Regressor, read_golf_regression_dataset
        >>> x, y = read_golf_regression_dataset(url)
        >>> model = CN2Regressor()
        >>> from sklearn.metrics import mean_absolute_error
        >>> model.fit(x, y)
        >>> y_pred = model.predict(x)
        >>> print(f"{mean_absolute_error(y, y_pred):.2f}")
        0.07
    """

    def __init__(
        self,
        criterion="std",
        max_rule_length: int = sys.maxsize,
        max_rules: int = sys.maxsize,
        min_rule_support=10,
        max_error_per_rule=0.99,
        backend=DEFAULT_BACKEND,
    ):
        """Store the hyperparameters of the CN2Regressor.

        Args:
            criterion (str): Error measure used to evaluate a split. Only
                "std" (standard deviation, equivalent to root MSE) is
                currently supported. Defaults to "std".
            max_rule_length (int): Upper bound on the number of conditions in
                a rule. Defaults to sys.maxsize.
            max_rules (int): Upper bound on the number of rules in the model.
                Defaults to sys.maxsize.
            min_rule_support (int): Fewest samples that must satisfy a rule's
                condition for the rule to be included in the model.
                Defaults to 10.
            max_error_per_rule (float): Largest (absolute) error a rule may
                have. Defaults to 0.99.
            backend (str): Computation backend. Defaults to DEFAULT_BACKEND.
        """
        # The backend is handled by the NominalRegressor base class.
        super().__init__(backend=backend)
        # Remaining hyperparameters are stored verbatim, in signature order.
        self.criterion = criterion
        self.max_rule_length = max_rule_length
        self.max_rules = max_rules
        self.min_rule_support = min_rule_support
        self.max_error_per_rule = max_error_per_rule

    def make_model(self, d: Dataset):
        """Build the CN2 trainer configured with this estimator's settings.

        Args:
            d (Dataset): The dataset to train on. Not used by this method.

        Returns:
            CN2: The CN2 trainer instance.
        """
        error_measure = self.build_error(self.criterion)
        # Positional order expected by CN2 after the error measure.
        rule_limits = (
            self.max_rule_length,
            self.max_rules,
            self.min_rule_support,
            self.max_error_per_rule,
        )
        return CN2(error_measure, *rule_limits)

    def fit(self, x, y):
        """Fit the CN2 model to the given training data.

        CN2 induces a set of regression rules through sequential covering:
        it iteratively finds rules that minimize the regression error on a
        subset of the training data and removes the samples they cover.

        Args:
            x (pd.DataFrame or np.ndarray): The training input samples.
            y (np.ndarray): The target values (real numbers).

        Returns:
            self: Returns the instance itself.
        """
        # Training is delegated entirely to the parent implementation.
        fitted = super().fit(x, y)
        return fitted

    def predict(self, x):
        """Predict the regression value for X.

        The learned rules are evaluated in order for every input sample; the
        first rule matching the sample provides the predicted target value
        (usually the mean of the samples that rule covered during training).

        Args:
            x (pd.DataFrame or np.ndarray): The input samples.

        Returns:
            np.ndarray: Predicted target values for X.
        """
        # Prediction is delegated entirely to the parent implementation.
        predictions = super().predict(x)
        return predictions