Source code for climb.tool.impl.data_suite.utils.data_utils

import pickle

import matplotlib.pyplot as plt
import numpy as np



[docs]
def scaler(fg, bg, center=True):
    """
    > Min-max scale a foreground and a background array using a scaler fitted on the foreground array.

    Each array is optionally shifted by its own mean along axis 0. A ``MinMaxScaler`` is then fitted on
    the foreground array only, and that same fitted scaler is applied to both arrays.

    Args:
      fg: foreground data; the scaler is fitted on this array
      bg: background data; transformed with the scaler that was fitted on ``fg``
      center: If True, each array has its own axis-0 mean subtracted before scaling. Defaults to True

    Returns:
      A tuple ``(scaled_fg, scaled_bg)``.
    """
    if center:
        fg_mean = np.mean(fg, axis=0)
        bg_mean = np.mean(bg, axis=0)
        fg = fg - fg_mean
        bg = bg - bg_mean

    # Function-scope import: scikit-learn is only imported when this function runs.
    from sklearn.preprocessing import MinMaxScaler

    # The scaling parameters come from the foreground data alone.
    fg_fitted = MinMaxScaler().fit(fg)
    scaled_fg = fg_fitted.transform(fg)
    scaled_bg = fg_fitted.transform(bg)
    return scaled_fg, scaled_bg




[docs]
def return_diagonal(array, just_diag=True):
    """
    > Zero the off-diagonal entries of a 2D array in place and collect the values on its main diagonal.

    Args:
      array: the 2D array to process. When ``just_diag`` is True it is modified IN PLACE: every entry
    whose row index differs from its column index is set to 0
      just_diag: If True, off-diagonal entries are zeroed and the diagonal values are collected. If
    False, the array is left untouched and no diagonal values are collected. Defaults to True

    Returns:
      A tuple ``(array, diagonal_vals)``: the same array object that was passed in, and a list with
      the main-diagonal entries (an empty list when ``just_diag`` is False).
    """
    diagonal_vals = []
    if just_diag:
        n_rows, n_cols = array.shape[0], array.shape[1]
        # Boolean mask that is True everywhere except on the main diagonal;
        # one vectorised assignment replaces the element-by-element loop.
        off_diagonal = ~np.eye(n_rows, n_cols, dtype=bool)
        array[off_diagonal] = 0
        # A non-square array only has min(rows, cols) diagonal entries.
        diagonal_vals = [array[k, k] for k in range(min(n_rows, n_cols))]
    return array, diagonal_vals




[docs]
def covariance_comparison(clean_array, noisy_array):
    """
    > Compare the per-feature spread of a noisy array against a clean array and return the indices of
    the features whose diagonal term is larger in the noisy data.

    Both arrays are first passed through ``scaler`` (fitted on the clean array). The matrix
    ``X.T @ X / (n_rows - 1)`` is computed for each, the clean matrix is subtracted from the noisy one,
    and only the diagonal of the difference is kept. A boolean image of "difference > 0" is shown with
    matplotlib as a side effect.

    Args:
      clean_array: The clean data array
      noisy_array: The array of noisy data

    Returns:
      a list of int feature indices whose diagonal entry of (noisy - clean) is greater than 0. The list
      is empty when no feature qualifies.
    """

    clean_array, noisy_array = scaler(clean_array, noisy_array, center=True)

    # NOTE(review): scaler() applies a MinMaxScaler after centering, so the scaled columns are not
    # guaranteed to be zero-mean; X.T @ X / (n - 1) is then a second-moment matrix rather than a
    # strict covariance. Confirm this is intended.
    noisy_array_cov = noisy_array.T.dot(noisy_array) / (noisy_array.shape[0] - 1)
    clean_array_cov = clean_array.T.dot(clean_array) / (clean_array.shape[0] - 1)

    fig = plt.figure(figsize=(10, 10))
    data_matrix, diagonal_vals = return_diagonal(
        noisy_array_cov - clean_array_cov,
    )
    # Draw into the figure created above; a hard-coded fignum=1 targets a different figure whenever
    # other figures are already open.
    plt.matshow(data_matrix > 0, fignum=fig.number, aspect="auto")
    plt.colorbar()
    plt.show()

    # flatnonzero handles zero, one or many hits uniformly, so the return type is always a plain
    # list of ints and no exception has to be swallowed.
    return np.flatnonzero(np.asarray(diagonal_vals) > 0).tolist()




[docs]
def get_suspect_features(clean_corpus, test_dataset, alpha=0.05):
    """
    > Run a two-sample Kolmogorov-Smirnov test on every column of a clean corpus and a test dataset and
    report the columns where the two samples differ significantly.

    Args:
      clean_corpus: the clean corpus; indexed as ``clean_corpus[:, feature]``
      test_dataset: the dataset you want to test for contamination; indexed the same way
      alpha: the significance level for the KS test. A feature is flagged when its p-value is
    strictly below this value. Defaults to 0.05

    Returns:
      A list of column indices (in increasing order) whose KS-test p-value is below ``alpha``.
    """

    from scipy.stats import ks_2samp

    # Side effect: the global NumPy random seed is reset on every call.
    np.random.seed(123456)

    def _is_suspicious(col):
        # Element [1] of the ks_2samp result is the p-value.
        p_value = ks_2samp(clean_corpus[:, col], test_dataset[:, col])[1]
        return p_value < alpha

    n_features = clean_corpus.shape[1]
    return [col for col in range(n_features) if _is_suspicious(col)]




[docs]
def write_to_file(contents, filename):
    """
    > Serialize an object with pickle and store the result in a file.

    The file is opened in binary write mode, so an existing file at that location is overwritten.

    Args:
      contents: the object to pickle
      filename: the name of the file that receives the pickled data
    """
    with open(filename, mode="wb") as pickle_file:
        pickler = pickle.Pickler(pickle_file)
        pickler.dump(contents)




[docs]
def read_from_file(filename):
    """
    > This function loads an object from a pickle file.

    Args:
      filename: the name of the file to read from

    Returns:
      the object that was stored in the pickle file.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files from trusted sources.
    # The context manager closes the file handle even when unpickling fails.
    with open(filename, "rb") as handle:
        return pickle.load(handle)