Source code for climb.tool.impl.data_suite.models.conformal

import random

import numpy as np
import pandas as pd
import torch
from nonconformist.base import RegressorAdapter
from nonconformist.icp import IcpRegressor
from nonconformist.nc import (
    AbsErrorErrFunc,
    RegressorNc,
    RegressorNormalizer,
    SignErrorErrFunc,
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

from climb.tool.impl.data_suite.models import nn_conformal as nnc

# Parameters if using a pytorch NN for the base learner
# desired miscoverage level
alpha = 0.1
# pytorch's optimizer object
nn_learn_func = torch.optim.SGD
# number of epochs
epochs = 1000
# learning rate
lr = 3e-3
# mini-batch size
batch_size = 32
# hidden dimension of the network
hidden_size = 2
# dropout regularization rate
dropout = 0.1
# weight decay regularization
wd = 0
# seed for splitting the data in cross-validation.
cv_test_ratio = 0.1
# ratio of held-out data, used in cross-validation
cv_random_state = 1


# model_dict={

#     "rf": RegressorAdapter(RandomForestRegressor(min_samples_leaf=5, random_state=42)),
#     "tree": RegressorAdapter(DecisionTreeRegressor(min_samples_leaf=5,random_state=42)),
#     "knn": RegressorAdapter(KNeighborsRegressor(n_neighbors=1,random_state=42)),
#     "nn": nnc.MSENet_RegressorAdapter(model=None,
#                                        fit_params=None,
#                                        in_shape = 3,
#                                        hidden_size = hidden_size,
#                                        learn_func = nn_learn_func,
#                                        epochs = epochs,
#                                        batch_size=batch_size,
#                                        dropout=dropout,
#                                        lr=lr,
#                                        wd=wd,
#                                        test_ratio=cv_test_ratio,
#                                        random_state=42)
# }

conformity_dict = {"abs": AbsErrorErrFunc(), "sign": SignErrorErrFunc()}


# This class is a wrapper for the conformal prediction library. It takes in a base learner, a
# normalizer, a conformity score, and a boolean for normalization. It also takes in the input
# dimension and a seed.

[docs]
class conformal_class:
    def __init__(
        self,
        base_name="rf",
        norm_name="knn",
        conformity_score="abs",
        normalize=True,
        input_dim=2,
        seed=42,
    ):
        input_dim = input_dim
        if not normalize:
            # TODO: define model_dict here
            underlying_model = model_dict[base_name]  # noqa F821
            nc = RegressorNc(
                underlying_model,
                conformity_dict[conformity_score],
            )

        else:
            if base_name == "rf":
                underlying_model = RegressorAdapter(
                    RandomForestRegressor(
                        min_samples_leaf=5,
                        random_state=seed,
                    ),
                )

            if base_name == "mlp":
                underlying_model = RegressorAdapter(
                    MLPRegressor(random_state=seed),
                )

            elif base_name == "tree":
                underlying_model = RegressorAdapter(
                    DecisionTreeRegressor(
                        min_samples_leaf=5,
                        random_state=seed,
                    ),
                )

            elif base_name == "knn":
                underlying_model = RegressorAdapter(
                    KNeighborsRegressor(n_neighbors=1),
                )

            elif base_name == "nn":
                underlying_model = nnc.MSENet_RegressorAdapter(
                    model=None,
                    fit_params=None,
                    in_shape=input_dim,
                    hidden_size=hidden_size,
                    learn_func=nn_learn_func,
                    epochs=epochs,
                    batch_size=batch_size,
                    dropout=dropout,
                    lr=lr,
                    wd=wd,
                    test_ratio=cv_test_ratio,
                    random_state=seed,
                )

            if norm_name == "rf":
                normalizing_model = RegressorAdapter(
                    RandomForestRegressor(
                        min_samples_leaf=5,
                        random_state=seed,
                    ),
                )

            elif norm_name == "tree":
                normalizing_model = RegressorAdapter(
                    DecisionTreeRegressor(
                        min_samples_leaf=5,
                        random_state=seed,
                    ),
                )

            elif norm_name == "knn":
                normalizing_model = RegressorAdapter(
                    KNeighborsRegressor(n_neighbors=1),
                )

            elif norm_name == "nn":
                normalizing_model = nnc.MSENet_RegressorAdapter(
                    model=None,
                    fit_params=None,
                    in_shape=3,
                    hidden_size=hidden_size,
                    learn_func=nn_learn_func,
                    epochs=epochs,
                    batch_size=batch_size,
                    dropout=dropout,
                    lr=lr,
                    wd=wd,
                    test_ratio=cv_test_ratio,
                    random_state=seed,
                )

            normalizer = RegressorNormalizer(
                underlying_model,
                normalizing_model,
                conformity_dict[conformity_score],
            )
            nc = RegressorNc(
                underlying_model,
                conformity_dict[conformity_score],
                normalizer,
            )

        self.seed = seed
        self.icp = IcpRegressor(nc)


[docs]
    def fit(self, x_train, y_train):
        """
        > This function takes in the training data and splits it into a training set and a calibration set.
        It is then used to fit the conformal predictor

        Args:
          x_train: The training data.
          y_train: The target variable
        """

        x_train, y_train = (
            np.array(x_train).astype(np.float32),
            np.array(
                y_train,
            ).astype(np.float32),
        )
        self.x_train, self.y_train = x_train, y_train
        random.seed(a=self.seed)
        np.random.seed(self.seed)
        self.icp.fit(x_train, y_train)
        train_indices = random.sample(
            range(x_train.shape[0]),
            int(x_train.shape[0] * 0.8),
        )
        calibration_indices = []
        for i in range(x_train.shape[0]):
            if i not in train_indices:
                calibration_indices.append(i)

        self.train_indices = train_indices

        self.icp.fit(x_train[train_indices, :], y_train[train_indices])
        self.icp.calibrate(
            x_train[calibration_indices, :],
            y_train[calibration_indices],
        )



[docs]
    def predict(self, x_test, y_test, just_conf=False):
        """
        > This function takes in the test data, and returns a dataframe with the confidence intervals, the true
        values, and the normalized confidence intervals.

        Args:
          x_test: the test data
          y_test: the true values of the test set
          just_conf: If True, only return the confidence intervals. Defaults to False

        Returns:
          The prediction of the model.
        """

        x_test = x_test.astype(np.float32)
        prediction = self.icp.predict(x_test, significance=0.1)
        header = ["min", "max", "true_val", "conf_interval"]
        size = prediction[:, 1] - prediction[:, 0]

        table = np.vstack([prediction.T, y_test, size.T]).T
        df = pd.DataFrame(table, columns=header)

        feature_array = self.y_train[self.train_indices]
        feature_range = np.max(feature_array) - np.min(feature_array)
        df["norm_interval"] = df["conf_interval"] / feature_range

        if just_conf:
            return df.conf_interval.values

        return df