# Source code for climb.tool.impl.data_suite.utils.data_utils

import pickle

import matplotlib.pyplot as plt
import numpy as np


def scaler(fg, bg, center=True):
    """Min-max scale two arrays with a scaler fitted on the foreground only.

    Each array is optionally centred on its *own* column means first. A
    ``MinMaxScaler`` is then fitted on the (possibly centred) foreground and
    applied to both arrays, so the background is expressed in the
    foreground's min/max range and may fall outside ``[0, 1]``.

    Args:
        fg: foreground data; the scaler is fitted on this array.
        bg: background data; transformed with the foreground's scaler.
        center: If True, subtract each array's column means (``axis=0``)
            before scaling. Defaults to True.

    Returns:
        A tuple ``(scaled_fg, scaled_bg)`` of the transformed arrays.
    """
    if center:
        fg, bg = fg - np.mean(fg, axis=0), bg - np.mean(bg, axis=0)

    # Imported lazily so scikit-learn is only needed when this is called.
    from sklearn.preprocessing import MinMaxScaler

    minmax = MinMaxScaler()
    minmax.fit(fg)
    scaled_fg = minmax.transform(fg)
    scaled_bg = minmax.transform(bg)
    return scaled_fg, scaled_bg
def return_diagonal(array, just_diag=True):
    """Zero the off-diagonal entries of a 2-D array and collect its diagonal.

    Args:
        array: 2-D NumPy array. When ``just_diag`` is True it is modified
            IN PLACE: every entry whose row and column index differ is set
            to 0. Pass a copy if the original values are still needed.
        just_diag: If True, strip the off-diagonal entries and collect the
            diagonal. If False, the array is left untouched and no diagonal
            values are collected. Defaults to True.

    Returns:
        A tuple ``(array, diagonal_vals)``. ``array`` is the same object
        that was passed in. ``diagonal_vals`` is a list of the entries
        ``array[i, i]`` in index order, or an empty list when ``just_diag``
        is False.
    """
    if not just_diag:
        return array, []

    n_rows, n_cols = array.shape[0], array.shape[1]

    # Read the diagonal out as a list of scalars before touching the array.
    diagonal_vals = list(np.diagonal(array))

    # Boolean mask of every position with row index != column index; one
    # vectorised assignment replaces the element-by-element Python loop.
    off_diagonal = ~np.eye(n_rows, n_cols, dtype=bool)
    array[off_diagonal] = 0

    return array, diagonal_vals
def covariance_comparison(clean_array, noisy_array):
    """Flag features whose variance-like term is larger in the noisy data.

    Both arrays are passed through ``scaler`` with ``center=True``. For each
    scaled array the matrix ``X.T.dot(X) / (n_rows - 1)`` is computed, the
    clean matrix is subtracted from the noisy one, and every feature whose
    diagonal entry of that difference is strictly positive is reported.

    Side effect: draws a 10x10 inch matplotlib figure showing which diagonal
    entries of the difference are positive and calls ``plt.show()``.

    NOTE(review): ``scaler`` min-max scales after centring, so the columns
    are generally not zero-mean when ``X.T.dot(X)`` is taken; the matrices
    are then uncentred second moments rather than strict covariances.
    Confirm this is intended before changing it.

    Args:
        clean_array: The clean data array (rows are samples, columns are
            features); the scaling is fitted on this array.
        noisy_array: The noisy data array with the same number of columns.

    Returns:
        A list of Python ``int`` feature indices whose diagonal difference
        is greater than 0. The list is empty when no feature qualifies.
    """
    clean_array, noisy_array = scaler(clean_array, noisy_array, center=True)

    noisy_array_cov = noisy_array.T.dot(noisy_array) / (noisy_array.shape[0] - 1)
    clean_array_cov = clean_array.T.dot(clean_array) / (clean_array.shape[0] - 1)

    # The difference is a temporary, so the in-place zeroing done by
    # return_diagonal does not touch either covariance matrix.
    data_matrix, diagonal_vals = return_diagonal(
        noisy_array_cov - clean_array_cov,
    )

    # Draw on the figure created here rather than on a hard-coded figure 1,
    # which is a different figure whenever other figures are already open.
    figure = plt.figure(figsize=(10, 10))
    plt.matshow(data_matrix > 0, fignum=figure.number, aspect="auto")
    plt.colorbar()
    plt.show()

    # flatnonzero handles zero, one or many hits uniformly, so no
    # squeeze/int special-casing or exception swallowing is required.
    return np.flatnonzero(np.asarray(diagonal_vals) > 0).tolist()
def get_suspect_features(clean_corpus, test_dataset, alpha=0.05):
    """List the features whose distribution differs between two datasets.

    Runs a two-sample Kolmogorov-Smirnov test on every column of
    ``clean_corpus`` against the same column of ``test_dataset``.

    Args:
        clean_corpus: the clean reference data, indexed as ``[:, feature]``.
        test_dataset: the dataset to test for contamination; compared
            column by column against ``clean_corpus``.
        alpha: the significance level for the KS test. A feature is flagged
            when its p-value is strictly below this. Defaults to 0.05.

    Returns:
        A list of column indices, in ascending order, flagged as suspicious.
    """
    from scipy.stats import ks_2samp

    # Resets NumPy's global RNG state on every call (kept from the original).
    # NOTE(review): the KS test does not appear to consume random numbers, so
    # this looks unnecessary; confirm no caller relies on it before removing.
    np.random.seed(123456)

    n_features = clean_corpus.shape[1]
    return [
        column
        for column in range(n_features)
        if ks_2samp(clean_corpus[:, column], test_dataset[:, column]).pvalue < alpha
    ]
def write_to_file(contents, filename):
    """Serialise an object to disk with pickle.

    The target file is opened in binary write mode, so an existing file at
    ``filename`` is overwritten.

    Args:
        contents: the object to be pickled.
        filename: the path of the file to write to.
    """
    with open(filename, mode="wb") as sink:
        pickle.dump(contents, sink)
def read_from_file(filename):
    """Load and return the object stored in a pickle file.

    Args:
        filename: the path of the pickle file to read from.

    Returns:
        The object that was unpickled from the file.
    """
    # SECURITY: unpickling can execute arbitrary code. Only call this on
    # files that come from a trusted source.
    # The context manager closes the handle even if unpickling raises.
    with open(filename, "rb") as handle:
        return pickle.load(handle)