# Source code for climb.tool.impl.data_suite.models.conformal
import random
import numpy as np
import pandas as pd
import torch
from nonconformist.base import RegressorAdapter
from nonconformist.icp import IcpRegressor
from nonconformist.nc import (
AbsErrorErrFunc,
RegressorNc,
RegressorNormalizer,
SignErrorErrFunc,
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from climb.tool.impl.data_suite.models import nn_conformal as nnc
# Module-level hyper-parameters, used when a PyTorch network ("nn") is chosen
# as a learner.
# desired miscoverage level
alpha = 0.1
# pytorch's optimizer class used to train the network
nn_learn_func = torch.optim.SGD
# number of training epochs
epochs = 1000
# learning rate
lr = 3e-3
# mini-batch size
batch_size = 32
# hidden dimension of the network
hidden_size = 2
# dropout regularization rate
dropout = 0.1
# weight decay regularization
wd = 0
# ratio of held-out data, used in cross-validation
cv_test_ratio = 0.1
# seed for splitting the data in cross-validation
# NOTE(review): not referenced anywhere in the visible part of this module.
cv_random_state = 1
# Disabled example of a name -> learner registry. It is commented out, so
# `model_dict` is NOT defined by this module.
# NOTE(review): the "knn" entry passes `random_state` to KNeighborsRegressor;
# confirm that keyword is accepted before re-enabling this block.
# model_dict={
# "rf": RegressorAdapter(RandomForestRegressor(min_samples_leaf=5, random_state=42)),
# "tree": RegressorAdapter(DecisionTreeRegressor(min_samples_leaf=5,random_state=42)),
# "knn": RegressorAdapter(KNeighborsRegressor(n_neighbors=1,random_state=42)),
# "nn": nnc.MSENet_RegressorAdapter(model=None,
# fit_params=None,
# in_shape = 3,
# hidden_size = hidden_size,
# learn_func = nn_learn_func,
# epochs = epochs,
# batch_size=batch_size,
# dropout=dropout,
# lr=lr,
# wd=wd,
# test_ratio=cv_test_ratio,
# random_state=42)
# }
# Maps the `conformity_score` name to the nonconformist error-function instance.
conformity_dict = {"abs": AbsErrorErrFunc(), "sign": SignErrorErrFunc()}
# Wrapper around the `nonconformist` inductive conformal regressor. It is
# configured with the name of a base learner, the name of a normalizer learner,
# the name of a conformity score, a flag that switches normalization on or off,
# the input dimension, and a random seed.
class conformal_class:
    """Inductive conformal regressor built from named learners.

    The instance wraps a ``nonconformist`` ``IcpRegressor`` (exposed as
    ``self.icp``) whose nonconformity function is assembled from a base
    learner and, optionally, a normalizer learner.

    Args:
        base_name: Name of the base learner. One of ``"rf"``, ``"mlp"``,
            ``"tree"``, ``"knn"`` or ``"nn"``.
        norm_name: Name of the learner used by the normalizer. Same choices as
            ``base_name``. Only used when ``normalize`` is true.
        conformity_score: Key into the module-level ``conformity_dict``
            (``"abs"`` or ``"sign"``).
        normalize: When true, wrap the base learner in a
            ``RegressorNormalizer``; otherwise use the base learner alone.
        input_dim: Input dimension handed to the ``"nn"`` learner as
            ``in_shape``.
        seed: Seed passed to the learners that accept one and used by
            :meth:`fit` to draw the train/calibration split.

    Raises:
        ValueError: If ``base_name`` (or ``norm_name`` when normalizing) is not
            a known learner name.
        KeyError: If ``conformity_score`` is not a key of ``conformity_dict``.
    """

    # Fraction of the rows given to `fit` that train the learners; the rest is
    # held out for calibration.
    _TRAIN_FRACTION = 0.8

    def __init__(
        self,
        base_name="rf",
        norm_name="knn",
        conformity_score="abs",
        normalize=True,
        input_dim=2,
        seed=42,
    ):
        # The same factory serves both branches, so `normalize=False` no longer
        # depends on an undefined `model_dict`.
        underlying_model = self._build_regressor(base_name, input_dim, seed)
        err_func = conformity_dict[conformity_score]

        if normalize:
            normalizing_model = self._build_regressor(norm_name, input_dim, seed)
            normalizer = RegressorNormalizer(
                underlying_model,
                normalizing_model,
                err_func,
            )
            nc = RegressorNc(underlying_model, err_func, normalizer)
        else:
            nc = RegressorNc(underlying_model, err_func)

        self.seed = seed
        self.icp = IcpRegressor(nc)

    @staticmethod
    def _build_regressor(name, input_dim, seed):
        """Return a fresh, unfitted learner adapter for the learner called ``name``."""
        if name == "rf":
            return RegressorAdapter(
                RandomForestRegressor(min_samples_leaf=5, random_state=seed),
            )
        if name == "mlp":
            return RegressorAdapter(MLPRegressor(random_state=seed))
        if name == "tree":
            return RegressorAdapter(
                DecisionTreeRegressor(min_samples_leaf=5, random_state=seed),
            )
        if name == "knn":
            return RegressorAdapter(KNeighborsRegressor(n_neighbors=1))
        if name == "nn":
            # The network consumes the raw features whether it acts as base
            # learner or as normalizer, so its input size is always
            # `input_dim` (the normalizer used to hard-code 3).
            return nnc.MSENet_RegressorAdapter(
                model=None,
                fit_params=None,
                in_shape=input_dim,
                hidden_size=hidden_size,
                learn_func=nn_learn_func,
                epochs=epochs,
                batch_size=batch_size,
                dropout=dropout,
                lr=lr,
                wd=wd,
                test_ratio=cv_test_ratio,
                random_state=seed,
            )
        raise ValueError(
            f"Unknown learner name {name!r}; "
            "expected one of 'rf', 'mlp', 'tree', 'knn', 'nn'"
        )

    def fit(self, x_train, y_train):
        """Split the data into train/calibration parts and fit the predictor.

        The inputs are converted to ``float32`` arrays and stored on the
        instance as ``x_train`` / ``y_train``. A seeded random 80% of the rows
        (stored as ``train_indices``) trains the learners; the remaining rows
        calibrate the conformal predictor.

        Args:
            x_train: Training features, indexable as a 2-D array after
                conversion.
            y_train: Training targets, one per row of ``x_train``.
        """
        x_train = np.array(x_train).astype(np.float32)
        y_train = np.array(y_train).astype(np.float32)
        self.x_train, self.y_train = x_train, y_train

        random.seed(a=self.seed)
        np.random.seed(self.seed)

        n_samples = x_train.shape[0]
        train_indices = random.sample(
            range(n_samples),
            int(n_samples * self._TRAIN_FRACTION),
        )
        # Set membership keeps the complement computation linear in n_samples.
        in_train = set(train_indices)
        calibration_indices = [i for i in range(n_samples) if i not in in_train]
        self.train_indices = train_indices

        # Fit on the training split only. Fitting on the full data first (as
        # was done previously) doubles the work and would expose calibration
        # rows to any learner that keeps state between `fit` calls.
        self.icp.fit(x_train[train_indices, :], y_train[train_indices])
        self.icp.calibrate(
            x_train[calibration_indices, :],
            y_train[calibration_indices],
        )

    def predict(self, x_test, y_test, just_conf=False, significance=0.1):
        """Predict conformal intervals for ``x_test``.

        Must be called after :meth:`fit`, because the normalized interval width
        uses the targets of the training split.

        Args:
            x_test: Test features; converted to a ``float32`` array.
            y_test: True target values for the test rows, one per row.
            just_conf: If true, return only the interval widths.
            significance: Miscoverage level passed to the conformal predictor.
                Defaults to 0.1.

        Returns:
            If ``just_conf`` is true, an array of interval widths. Otherwise a
            ``pandas.DataFrame`` with the columns ``min``, ``max``,
            ``true_val``, ``conf_interval`` (``max - min``) and
            ``norm_interval`` (``conf_interval`` divided by the range of the
            training-split targets).
        """
        # `asarray` also accepts lists and data frames, matching what `fit`
        # accepts.
        x_test = np.asarray(x_test, dtype=np.float32)
        prediction = self.icp.predict(x_test, significance=significance)

        header = ["min", "max", "true_val", "conf_interval"]
        size = prediction[:, 1] - prediction[:, 0]
        table = np.vstack([prediction.T, y_test, size.T]).T
        df = pd.DataFrame(table, columns=header)

        # Normalize the widths by the spread of the targets the learners saw.
        feature_array = self.y_train[self.train_indices]
        feature_range = np.max(feature_array) - np.min(feature_array)
        df["norm_interval"] = df["conf_interval"] / feature_range

        if just_conf:
            return df.conf_interval.values
        return df