Source code for climb.tool.impl.data_suite.data.data_loader

import random
from copy import deepcopy

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


def generate_synthetic_large(num_samples=1000):
    """Draw a five-feature multivariate-normal sample and split it in two.

    The rows are drawn from NumPy's global random state, so the values are
    only reproducible if the caller seeds ``np.random`` beforehand. The
    train/test partition itself is seeded (``random_state=42``).

    Args:
        num_samples: Number of rows to draw. Defaults to 1000.

    Returns:
        A ``(train, test)`` tuple of numpy arrays, with 66% of the rows
        placed in ``test``.
    """
    # Per-feature means of the sampled distribution.
    feature_means = np.array([5.0, 0.0, 10.0, 2.0, 7.0])

    # Matrix handed to the sampler as the covariance.
    # NOTE(review): this matrix is not symmetric (entry [0, 2] is -2.00 while
    # [2, 0] is -5.00), so it is not a valid covariance matrix. The values are
    # kept as-is because the intended matrix is unknown -- confirm with the
    # data owner.
    covariance = np.array(
        [
            [3.40, -2.75, -2.00, -5.75, -2.00],
            [-2.75, 5.50, 1.50, 2.75, 3.00],
            [-5.00, 1.50, 1.25, 0.75, -3.00],
            [2.00, 1.50, 3.25, 2.75, 1.00],
            [5.00, 3.50, 5.25, -0.75, 2.00],
        ],
    )

    samples = np.random.multivariate_normal(
        feature_means,
        covariance,
        size=num_samples,
    )
    train_part, test_part = train_test_split(
        samples,
        test_size=0.66,
        random_state=42,
    )
    return train_part, test_part
def generate_synthetic_small(num_samples=1000):
    """Draw a three-feature multivariate-normal sample and split it in two.

    The rows are drawn from NumPy's global random state, so the values are
    only reproducible if the caller seeds ``np.random`` beforehand. The
    train/test partition itself is seeded (``random_state=42``).

    Args:
        num_samples: Number of rows to draw. Defaults to 1000.

    Returns:
        A ``(train, test)`` tuple of numpy arrays, with 66% of the rows
        placed in ``test``.
    """
    # Per-feature means of the sampled distribution.
    feature_means = np.array([5.0, 0.0, 10.0])

    # Covariance matrix passed to the sampler.
    covariance = np.array(
        [
            [3.40, -2.75, -2.00],
            [-2.75, 5.50, 1.50],
            [-2.00, 1.50, 1.25],
        ],
    )

    samples = np.random.multivariate_normal(
        feature_means,
        covariance,
        size=num_samples,
    )
    train_part, test_part = train_test_split(
        samples,
        test_size=0.66,
        random_state=42,
    )
    return train_part, test_part
def corrupt_data_func(
    data,
    feat_list,
    mean=0,
    variance=1,
    proportion=0.5,
    dist="normal",
):
    """Add random noise to a random subset of rows of a feature column.

    ``data`` itself is never modified; the noisy column is returned as a
    separate list. When ``feat_list`` names several features, each one is
    processed in turn but only the results for the LAST feature are returned
    (the per-feature lists are reset on every iteration).

    Args:
        data: 2-D numpy array indexed as ``data[row, feature]``.
        feat_list: Column indices to corrupt. An empty list yields empty
            result lists.
        mean: Location of the noise when ``dist == "normal"``. Defaults to 0.
        variance: Passed to ``np.random.normal`` as its ``scale`` argument
            (i.e. it acts as a standard deviation) when ``dist == "normal"``.
            Defaults to 1.
        proportion: Probability with which each row is corrupted.
            Defaults to 0.5.
        dist: Noise distribution: ``"normal"``, ``"beta"`` (a=8, b=2),
            ``"weibull"`` (a=2) or ``"gamma"`` (shape=1, scale=2).
            Defaults to ``"normal"``.

    Returns:
        A tuple ``(corrupt_data, data, corrupt_ids, noise, noise_id)``:
        ``corrupt_data`` is the column with noise added where selected,
        ``data`` is the untouched input object, ``corrupt_ids`` holds 1 for
        corrupted rows and 0 otherwise, ``noise`` holds the added noise
        (0 for untouched rows) and ``noise_id`` lists the corrupted row
        indices.

    Raises:
        ValueError: If ``dist`` is not one of the supported distributions.
    """
    samplers = {
        "normal": lambda: np.random.normal(mean, variance),
        "beta": lambda: np.random.beta(8, 2),
        "weibull": lambda: np.random.weibull(2),
        "gamma": lambda: np.random.gamma(1, 2),
    }
    # Fail early with a clear message instead of hitting an unbound local
    # the first time a row is selected for corruption.
    if dist not in samplers:
        raise ValueError(
            f"Unsupported dist {dist!r}; expected one of {sorted(samplers)}",
        )
    draw_noise = samplers[dist]

    # Initialised up front so an empty feat_list returns empty results.
    corrupt_data, corrupt_ids, noise, noise_id = [], [], [], []

    for feat_idx in feat_list:
        corrupt_data, corrupt_ids, noise, noise_id = [], [], [], []
        # Deliberately row-by-row: one `random.random()` draw per row, followed
        # by one numpy draw for selected rows, keeps seeded runs reproducible.
        for row, value in enumerate(data[:, feat_idx]):
            if random.random() < proportion:
                noisy = draw_noise()
                noise.append(noisy)
                corrupt_data.append(value + noisy)
                corrupt_ids.append(1)
                noise_id.append(row)
            else:
                corrupt_ids.append(0)
                noise.append(0)
                corrupt_data.append(value)

    return corrupt_data, data, corrupt_ids, noise, noise_id
def load_synthetic_data(
    n_synthetic=1000,
    mean=0,
    noise_variance=0,
    dim="small",
    prop=0.5,
    dist="normal",
):
    """Generate a synthetic train/test pair whose test set has a noisy column.

    A clean sample is generated and split, then feature 0 of the test split
    is corrupted via ``corrupt_data_func``.

    Args:
        n_synthetic: Number of samples to generate. Defaults to 1000.
        mean: Mean of the noise distribution (used by the "normal"
            distribution). Defaults to 0.
        noise_variance: Spread parameter forwarded to ``corrupt_data_func``
            as ``variance``. Defaults to 0.
        dim: ``"small"`` (3 features) or ``"large"`` (5 features).
            Defaults to ``"small"``.
        prop: Proportion of test rows to corrupt. Numeric strings such as
            ``"0.5"`` are accepted and converted to float. Defaults to 0.5.
        dist: Noise distribution understood by ``corrupt_data_func``
            (``"normal"``, ``"beta"``, ``"weibull"`` or ``"gamma"``).
            Defaults to ``"normal"``.

    Returns:
        A tuple ``(train, test, orig_test, y_test_ids, noise_values,
        noise_idx)``: the training split, the test split with feature 0
        corrupted, the clean test split, the per-row 0/1 corruption flags,
        the per-row noise values, and the indices of the corrupted rows.

    Raises:
        ValueError: If ``dim`` is neither ``"small"`` nor ``"large"``.
    """
    generators = {
        "small": generate_synthetic_small,
        "large": generate_synthetic_large,
    }
    if dim not in generators:
        raise ValueError(f"Unsupported dim {dim!r}; expected 'small' or 'large'")
    train, test_clean = generators[dim](num_samples=n_synthetic)

    test_corrupted, orig_test, noise_bool, noise_values, noise_idx = corrupt_data_func(
        data=test_clean,
        feat_list=[0],
        # Forward the caller's mean; it used to be hard-coded to 0 here.
        mean=mean,
        variance=noise_variance,
        # The proportion is compared against a float, so a string would fail.
        proportion=float(prop),
        dist=dist,
    )

    y_test_ids = noise_bool

    # Copy so the clean test split returned alongside stays untouched.
    test = deepcopy(orig_test)
    test[:, 0] = test_corrupted

    return train, test, orig_test, y_test_ids, noise_values, noise_idx
def load_adult_data(split_size=0.3):
    """Load the adult dataset and build a sex-skewed train/test split.

    The CSV is read from ``data/adult.csv`` (falling back to
    ``../data/adult.csv``), rows with missing values are dropped and the
    categorical columns are encoded as integers. Rows with ``sex == 1`` are
    split with ``train_test_split``; rows with ``sex == 0`` are then appended
    by salary code: salary 0 (" >50K") goes to train and salary 1 (" <=50K")
    goes to test.

    Side effect: reseeds the global ``random`` module with 42.

    Args:
        split_size: Proportion of the ``sex == 1`` rows to place in the
            test split. Defaults to 0.3.

    Returns:
        ``(X_train, X_test, y_train, y_test, X, y)``. The first four are
        numpy arrays that include the appended ``sex == 0`` rows; ``X`` and
        ``y`` are the pandas features/labels of the ``sex == 1`` rows only.
    """

    def process_dataset(df):
        """Encode the raw columns numerically and drop incomplete rows.

        Args:
            df: The raw dataframe; it is modified in place.

        Returns:
            The same dataframe after encoding.
        """
        df["salary"] = df["salary"].map({" <=50K": 1, " >50K": 0}).astype(int)
        df["sex"] = df["sex"].map({" Male": 1, " Female": 0}).astype(int)

        # " ?" marks a missing value in these columns.
        for column in ("country", "workclass", "occupation"):
            df[column] = df[column].replace(" ?", np.nan)
        df.dropna(how="any", inplace=True)

        # 1 for the United States, 0 for every other country.
        df["country"] = (df["country"] == " United-States").astype(int)

        df["marital-status"] = df["marital-status"].replace(
            [
                " Divorced",
                " Married-spouse-absent",
                " Never-married",
                " Separated",
                " Widowed",
            ],
            "Single",
        )
        df["marital-status"] = df["marital-status"].replace(
            [" Married-AF-spouse", " Married-civ-spouse"],
            "Couple",
        )
        df["marital-status"] = df["marital-status"].map(
            {"Couple": 0, "Single": 1},
        )

        rel_map = {
            " Unmarried": 0,
            " Wife": 1,
            " Husband": 2,
            " Not-in-family": 3,
            " Own-child": 4,
            " Other-relative": 5,
        }
        df["relationship"] = df["relationship"].map(rel_map)

        race_map = {
            " White": 0,
            " Amer-Indian-Eskimo": 1,
            " Asian-Pac-Islander": 2,
            " Black": 3,
            " Other": 4,
        }
        df["race"] = df["race"].map(race_map)

        # govt -> 0, private -> 1, self-employed -> 2, anything else -> 3.
        # A vectorised lookup replaces the former row-wise `apply`.
        workclass_codes = {
            " Federal-gov": 0,
            " Local-gov": 0,
            " State-gov": 0,
            " Private": 1,
            " Self-emp-inc": 2,
            " Self-emp-not-inc": 2,
        }
        df["employment_type"] = (
            df["workclass"].map(workclass_codes).fillna(3).astype("int64")
        )

        df.drop(
            labels=["workclass", "education", "occupation"],
            axis=1,
            inplace=True,
        )

        # Binarise: any positive amount becomes 1; zeros are already 0.
        df.loc[df["capital-gain"] > 0, "capital-gain"] = 1
        df.loc[df["capital-loss"] > 0, "capital-loss"] = 1

        return df

    try:
        df = pd.read_csv("data/adult.csv", delimiter=",")
    except OSError:
        # Only fall back for filesystem errors; parse errors and interrupts
        # should surface instead of being masked by a second read attempt.
        df = pd.read_csv("../data/adult.csv", delimiter=",")

    df = process_dataset(df)

    sex_1_rows = df.query("sex ==1")
    sex_0_salary_1 = df.query("sex == 0 & salary == 1")
    sex_0_salary_0 = df.query("sex == 0 & salary == 0")

    X = sex_1_rows.drop(["salary"], axis=1)
    y = sex_1_rows["salary"]

    # Creation of Train and Test dataset
    random.seed(a=42)

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=split_size,
        random_state=42,
    )

    # Sampling every index without replacement is a seeded shuffle.
    sample_ids = random.sample(range(len(X_train)), X_train.shape[0])
    X_train = X_train.iloc[sample_ids, :]
    y_train = y_train.iloc[sample_ids]

    X_train = np.vstack([X_train, sex_0_salary_0.drop(["salary"], axis=1)])
    X_test = np.vstack([X_test, sex_0_salary_1.drop(["salary"], axis=1)])

    y_train = np.hstack([y_train, sex_0_salary_0["salary"]])
    y_test = np.hstack([y_test, sex_0_salary_1["salary"]])

    return X_train, X_test, y_train, y_test, X, y
def load_electric(path="electricity.arff"):
    """Load the electricity ARFF file and return fixed train/test slices.

    The file is read from ``data/<path>`` (falling back to
    ``../data/<path>``). The ``class`` column is ordinal-encoded into a new
    last column, ``class_encoded``, which serves as the label. Features are
    the columns at positions 2 to 7.

    Rows 0-9999 form the training set. The test set is rows 10000-14999
    followed by rows 30000-39999; all other rows are unused.

    Args:
        path: File name of the dataset inside the data directory.
            Defaults to ``"electricity.arff"``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as pandas objects.
    """
    from scipy.io.arff import loadarff
    from sklearn.preprocessing import OrdinalEncoder

    try:
        raw_data = loadarff(f"data/{path}")
    except OSError:
        # Only fall back for filesystem errors; parse errors and interrupts
        # should surface instead of being masked by a second read attempt.
        raw_data = loadarff(f"../data/{path}")

    df = pd.DataFrame(raw_data[0])

    ord_enc = OrdinalEncoder()
    df["class_encoded"] = ord_enc.fit_transform(df[["class"]])

    X_train = df.iloc[0:10000, 2:8]
    y_train = df.iloc[0:10000, -1]

    X_val = df.iloc[10000:15000, 2:8]
    y_val = df.iloc[10000:15000, -1]

    X_test_real = df.iloc[30000:40000, 2:8]
    y_test_real = df.iloc[30000:40000, -1]

    X_test = pd.concat([X_val, X_test_real])
    y_test = pd.concat([y_val, y_test_real])

    return X_train, X_test, y_train, y_test