Source code for climb.tool.impl.data_suite.utils.helpers

from copy import deepcopy


def inlier_outlier_dicts(conformal_dict, suspect_features):
    """Split the rows of each suspect feature into inliers and outliers.

    A row is an inlier when its ``true_val`` lies strictly inside the band
    ``(min - eps, max + eps)``, where ``eps`` is 5% of that row's
    ``conf_interval``. Every other row is an outlier, including rows where a
    comparison involves NaN (NaN comparisons evaluate to False).

    Args:
        conformal_dict: mapping from integer feature id to a DataFrame that
            has the columns ``true_val``, ``min``, ``max`` and
            ``conf_interval``.
        suspect_features: iterable of feature ids to inspect. Each id is
            converted with ``int()`` before it is used as a key.

    Returns:
        A tuple ``(inliers_dict, outliers_dict)``. Both map the integer
        feature id to an array of index labels taken from that feature's
        DataFrame.

    Raises:
        KeyError: if a feature id is missing from ``conformal_dict`` or a
            required column is missing from its DataFrame.
    """
    inliers_dict = {}
    outliers_dict = {}

    for feature in suspect_features:
        feature = int(feature)
        frame = conformal_dict[feature]

        # Column-wise comparison instead of a per-row ``apply``: same strict
        # inequalities, but evaluated in one pass and well-defined for a
        # frame with zero rows. The input frame is only read, so no copy of
        # it is needed.
        tolerance = 0.05 * frame["conf_interval"]
        above_lower = frame["true_val"] > frame["min"] - tolerance
        below_upper = frame["true_val"] < frame["max"] + tolerance
        is_inlier = (above_lower & below_upper).values

        labels = frame.index.values
        inliers_dict[feature] = labels[is_inlier]
        outliers_dict[feature] = labels[~is_inlier]

    return inliers_dict, outliers_dict
def sort_cis_synth(conformal_dict, inliers_dict, suspect_features, proportion=0.1):
    """Rank samples by their combined ``norm_interval`` over suspect features.

    The inlier rows of the first suspect feature form the starting frame.
    For every further suspect feature, that feature's ``norm_interval`` is
    stored in a ``"<feat>_contrib"`` column and the feature's complete frame
    is added to the running frame with ``DataFrame.add(..., fill_value=0)``.
    Inlier filtering is applied to the first feature only. The result is
    sorted by ``norm_interval`` in ascending order.

    Args:
        conformal_dict: mapping from feature id to a DataFrame holding that
            feature's conformal intervals; a ``norm_interval`` column is
            required.
        inliers_dict: mapping from feature id to the inlier row ids of that
            feature. Only the entry of the first suspect feature is read.
        suspect_features: non-empty sequence of feature ids.
        proportion: fraction of the first feature's frame length that is
            returned at each end of the ranking.

    Returns:
        A tuple ``(small_ci_ids, large_ci_ids, df_sorted)``: the index labels
        of the ``n`` narrowest and ``n`` widest rows, with
        ``n = int(len(frame) * proportion)``, and the sorted DataFrame.
        Both id arrays are empty when ``n`` is 0.
    """
    feature = suspect_features[0]
    df_conformal = conformal_dict[feature]
    inlier_ids = inliers_dict[feature]

    # NOTE(review): ``iloc`` selects by position, so ``inlier_ids`` must be
    # row positions. They match index labels only for a default RangeIndex.
    # Explicit copy: the column assignment below must target our own frame,
    # not a selection derived from the caller's DataFrame.
    df_inlier = df_conformal.iloc[inlier_ids, :].copy()
    df_inlier[f"{feature}_contrib"] = df_inlier["norm_interval"]

    nsamples = int(len(df_conformal) * proportion)

    for feat in suspect_features[1:]:
        print(f"suspect - {feat}")
        df_feat = conformal_dict[feat]
        df_inlier[f"{feat}_contrib"] = df_feat["norm_interval"]
        df_inlier = df_inlier.add(df_feat, fill_value=0)

    df_sorted = df_inlier.sort_values(by=["norm_interval"], ascending=True)
    sorted_ids = df_sorted.index.values

    small_ci_ids = sorted_ids[0:nsamples]
    # ``sorted_ids[-0:]`` would be the whole array, so zero samples needs an
    # explicit empty slice.
    large_ci_ids = sorted_ids[-nsamples:] if nsamples else sorted_ids[:0]

    return small_ci_ids, large_ci_ids, df_sorted
def sort_cis_all(conformal_dict, inliers_dict, suspect_features, proportion=0.5):
    """Rank inlier samples by their combined ``norm_interval``.

    The inlier rows of every suspect feature are selected and added
    together with ``DataFrame.add(..., fill_value=0)``. The summed frame is
    sorted by ``norm_interval`` in ascending order.

    Args:
        conformal_dict: mapping from feature id to a DataFrame holding that
            feature's conformal intervals; a ``norm_interval`` column is
            required.
        inliers_dict: mapping from feature id to the inlier row ids of that
            feature.
        suspect_features: non-empty sequence of feature ids.
        proportion: fraction of the first feature's frame length that is
            returned at each end of the ranking. Defaults to 0.5, the value
            this function previously had hard-coded.

    Returns:
        A tuple ``(small_ci_ids, large_ci_ids, df_sorted)``: the index labels
        of the ``n`` narrowest and ``n`` widest rows, with
        ``n = int(len(frame) * proportion)``, and the sorted DataFrame.
        Both id arrays are empty when ``n`` is 0.
    """
    feature = suspect_features[0]
    df_conformal = conformal_dict[feature]

    # NOTE(review): ``iloc`` selects by position, so the ids in
    # ``inliers_dict`` must be row positions. They match index labels only
    # for a default RangeIndex.
    df_inlier = df_conformal.iloc[inliers_dict[feature], :]

    nsamples = int(len(df_conformal) * proportion)

    for feat in suspect_features[1:]:
        print(f"suspect - {feat}")
        df_inlier_feat = conformal_dict[feat].iloc[inliers_dict[feat], :]
        df_inlier = df_inlier.add(df_inlier_feat, fill_value=0)

    df_sorted = df_inlier.sort_values(by=["norm_interval"], ascending=True)
    sorted_ids = df_sorted.index.values

    small_ci_ids = sorted_ids[0:nsamples]
    # ``sorted_ids[-0:]`` would be the whole array, so zero samples needs an
    # explicit empty slice.
    large_ci_ids = sorted_ids[-nsamples:] if nsamples else sorted_ids[:0]

    return small_ci_ids, large_ci_ids, df_sorted
def sort_ci_vals(conformal_dict, inliers_dict, suspect_features, proportion=0.1):
    """Rank all samples by their combined ``norm_interval``.

    Starts from a copy of the first suspect feature's frame. For every
    further suspect feature, that feature's ``norm_interval`` is stored in a
    ``"<feat>_contrib"`` column and the feature's complete frame is added to
    the running frame with ``DataFrame.add(..., fill_value=0)``. The result
    is sorted by ``norm_interval`` in ascending order. The frames held in
    ``conformal_dict`` are not modified.

    Args:
        conformal_dict: mapping from feature id to a DataFrame holding that
            feature's conformal intervals; a ``norm_interval`` column is
            required.
        inliers_dict: not read by this function; kept so the signature
            matches the sibling ``sort_cis_*`` helpers.
        suspect_features: non-empty sequence of feature ids.
        proportion: fraction of the first feature's frame length that is
            returned at each end of the ranking.

    Returns:
        A tuple ``(small_ci_ids, large_ci_ids, df_sorted)``: the index labels
        of the ``n`` narrowest and ``n`` widest rows, with
        ``n = int(len(frame) * proportion)``, and the sorted DataFrame.
        Both id arrays are empty when ``n`` is 0.
    """
    feature = suspect_features[0]
    df_conformal = conformal_dict[feature]

    # Work on a copy: assigning the contribution column directly would add
    # it to the caller's DataFrame stored in ``conformal_dict``.
    df_combined = df_conformal.copy()
    df_combined[f"{feature}_contrib"] = df_combined["norm_interval"]

    nsamples = int(len(df_conformal) * proportion)

    for feat in suspect_features[1:]:
        print(f"Evaluating feature - {feat}")
        df_feat = conformal_dict[feat]
        df_combined[f"{feat}_contrib"] = df_feat["norm_interval"]
        df_combined = df_combined.add(df_feat, fill_value=0)

    df_sorted = df_combined.sort_values(by=["norm_interval"], ascending=True)
    sorted_ids = df_sorted.index.values

    small_ci_ids = sorted_ids[0:nsamples]
    # ``sorted_ids[-0:]`` would be the whole array, so zero samples needs an
    # explicit empty slice.
    large_ci_ids = sorted_ids[-nsamples:] if nsamples else sorted_ids[:0]

    return small_ci_ids, large_ci_ids, df_sorted