# Source code for climb.tool.impl.tool_data_suite

import math
import os
from pathlib import Path
from typing import Any, Dict

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

from ..tool_comms import ToolCommunicator, ToolReturnIter, execute_tool
from ..tools import ToolBase
from .data_suite.models.conformal import conformal_class
from .data_suite.models.copula import fit_sample_copula
from .data_suite.models.representation import compute_representation
from .data_suite.utils.helpers import inlier_outlier_dicts, sort_cis_synth



# [docs]
def clean_dataframe(df, unique_threshold=15):
    """Return a numerically encoded copy of ``df``.

    Every column is classified exactly once, in this order of precedence:

    * ``bool`` dtype: cast to ``int``.
    * Fewer than ``unique_threshold`` distinct non-null values, or ``object``
      dtype: replaced by the integer codes of ``pd.Categorical`` (missing
      values are encoded as ``-1`` by pandas).
    * Numeric dtype: passed through ``pd.to_numeric(errors="coerce")`` and
      missing values are filled with the column median.
    * Anything else: treated as numeric if ``pd.to_numeric`` accepts it,
      otherwise treated as categorical.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified.
        unique_threshold: Columns with fewer distinct non-null values than
            this are treated as categorical, whatever their dtype.

    Returns:
        A new ``pandas.DataFrame`` with the same columns, encoded as above.
    """
    # Work on a copy so the caller's frame is not silently rewritten.
    df = df.copy()

    categorical_columns = []
    numerical_columns = []
    boolean_columns = []

    for col in df.columns:
        column = df[col]
        num_unique_values = column.nunique(dropna=True)

        if column.dtype == "bool":
            boolean_columns.append(col)
        elif num_unique_values < unique_threshold or column.dtype == "object":
            categorical_columns.append(col)
        elif pd.api.types.is_numeric_dtype(column):
            numerical_columns.append(col)
        else:
            # Mixed or unexpected dtypes: try a numeric view first and fall
            # back to categorical when pandas refuses the conversion.
            try:
                df[col] = pd.to_numeric(column, errors="coerce")
            except (ValueError, TypeError):
                categorical_columns.append(col)
            else:
                numerical_columns.append(col)

    # Convert categorical columns to category indices.
    for col in categorical_columns:
        df[col] = pd.Categorical(df[col]).codes

    # Clean numerical columns: coerce, then fill gaps with the median.
    for col in numerical_columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")
        df[col] = df[col].fillna(df[col].median())

    # Convert boolean columns to integers.
    for col in boolean_columns:
        df[col] = df[col].astype(int)

    return df




# [docs]
def data_suite_insights(
    tc: ToolCommunicator,
    data_file_path: str,
    target_column: str,
    workspace: str,
) -> None:
    """Report rows with large conformal intervals and exemplar records to collect.

    The CSV is loaded, encoded with ``clean_dataframe``, shuffled and split
    70/30 into train and test features (the target column is dropped). A
    copula (``copula="gauss"``) is fitted on the training features and sampled,
    a ``rep_type="pca"`` representation is computed for train, test and copula
    samples, and one conformal predictor per feature is fitted on the copula
    samples and evaluated on the test split. The ids returned as
    ``large_ci_ids`` by ``sort_cis_synth`` are looked up in the shuffled frame,
    clustered with KMeans, and the per-cluster feature means are written to
    ``Data_suite_examples_to_collect.csv`` in the workspace.

    Args:
        tc (ToolCommunicator): The tool communicator object; used for progress
            messages and for the final tool return.
        data_file_path (str): The path to the input CSV file.
        target_column (str): Name of the target column; it is excluded from
            the feature matrix.
        workspace (str): The workspace directory path where the CSV of
            exemplar records is saved.

    Returns:
        None. The outcome is delivered through ``tc.set_returns``.
    """
    workspace_dir = Path(workspace)

    # Load and numerically encode the data.
    df = clean_dataframe(pd.read_csv(data_file_path))

    # Shuffle the data and renumber rows 0..n-1.
    df = df.sample(frac=1)
    df.reset_index(drop=True, inplace=True)

    X = df.drop(columns=[target_column])
    all_features = list(X.columns)

    # Only the feature split is needed downstream; the target is not used.
    X_train, X_test = train_test_split(X, test_size=0.3, random_state=42)
    train = X_train.values
    test = X_test.values

    # Every feature (by column position) is treated as suspect.
    suspect_features = list(range(train.shape[1]))

    # Parameters for the copula and the representer.
    copula_n_samples = 1000
    rep_type = "pca"

    # Step 1: fit and sample a copula on the training features.
    copula_samples = fit_sample_copula(
        clean_corpus=train,
        copula="gauss",
        copula_n_samples=copula_n_samples,
    )

    # Step 2: REPRESENTER - low dimensional representation of the data. The
    # dimension is half the feature count (rounded up); it is a hyperparameter.
    rep_dim = int(np.ceil(train.shape[1] / 2))
    _pcs_train, pcs_test, pcs_copula = compute_representation(
        train,
        test,
        copula_samples,
        n_components=rep_dim,
        rep_type=rep_type,
    )

    # Step 3: CONFORMAL PREDICTOR - one predictor per feature, fitted on the
    # copula samples and evaluated on the test split.
    dim = pcs_copula.shape[1]  # identical for every feature, so computed once
    conformal_dict = {}
    for feat in suspect_features:
        # Announce the feature before the work starts so the message reflects
        # what is currently running.
        tc.print(f"Running analysis for feature = {feat}")
        conf = conformal_class(conformity_score="sign", input_dim=dim)
        conf.fit(x_train=pcs_copula, y_train=copula_samples[:, feat])
        conformal_dict[feat] = conf.predict(x_test=pcs_test, y_test=test[:, feat])

    # Step 4: PROCESS CONFORMAL INTERVALS.
    inliers_dict, _outliers_dict = inlier_outlier_dicts(conformal_dict, suspect_features)
    # Threshold for the proportion of inliers.
    proportion = 0.4
    # NOTE(review): only feature 0 is passed here although every feature was
    # analysed above - confirm this is intended.
    _small_ci_ids, large_ci_ids, _df_sorted = sort_cis_synth(
        conformal_dict, inliers_dict, suspect_features=[0], proportion=proportion
    )

    # len() rather than truthiness: the ids may be an array-like.
    if len(large_ci_ids) == 0:
        tc.set_returns(
            tool_return=("The model is performing well on all the data points."),
        )
        return

    # NOTE(review): the ids are used as row labels of the shuffled full frame.
    # If sort_cis_synth returns positions within the test split they would
    # need mapping through X_test.index first - confirm against the helper.
    # The explicit copy keeps the added "Cluster" column local to this slice.
    df_large_ci = df.loc[large_ci_ids].copy()

    cluster_model = KMeans(n_clusters=math.ceil(math.sqrt(len(large_ci_ids))))
    cluster_model.fit(df_large_ci)
    df_large_ci["Cluster"] = cluster_model.labels_
    # Mean of each feature per cluster serves as the exemplar record.
    cluster_means = df_large_ci.groupby("Cluster")[all_features].mean()
    cluster_means.reset_index().to_csv(workspace_dir / "Data_suite_examples_to_collect.csv", index=False)

    tc.set_returns(
        tool_return=(
            f"The following features have large conformal intervals and these are the data points that the model may perform poorly on: {large_ci_ids}.\n"
            f"Here are records that approximates the data points that the model may perform poorly on: {cluster_means}.\n"
            f"It is therefore advised that you collect more records that are similar to the example above in order to improve the model's performance.\n"
            f"The exemplar records have also been saved to the workspace directory as 'Data_suite_examples_to_collect.csv'."
        ),
    )




# [docs]
class DataSuiteInsights(ToolBase):
    """Tool wrapper that launches ``data_suite_insights`` via ``execute_tool``."""

    def _execute(self, **kwargs: Any) -> ToolReturnIter:
        """Start the analysis and return its output stream.

        Expects ``data_file_path`` (joined onto the working directory) and
        ``target_column`` in ``kwargs``. The first value returned by
        ``execute_tool`` is stored on ``self.tool_thread``.
        """
        working_dir = self.working_directory
        csv_path = os.path.join(working_dir, kwargs["data_file_path"])
        launched = execute_tool(
            data_suite_insights,
            wd=working_dir,
            data_file_path=csv_path,
            target_column=kwargs["target_column"],
            workspace=working_dir,
        )
        self.tool_thread, stream = launched
        return stream

    @property
    def name(self) -> str:
        """Identifier under which the tool is registered."""
        return "data_suite_insights"

    @property
    def description(self) -> str:
        """Description text embedded in the tool specification."""
        return """
        Uses the data_suite_insights tool to gain insights regions of the dataset that the model may perform poorly on.
        """

    @property
    def specification(self) -> Dict[str, Any]:
        """Function-style schema with two required string parameters."""
        properties = {
            "data_file_path": {"type": "string", "description": "Path to the data file."},
            "target_column": {"type": "string", "description": "Name of the target column."},
        }
        parameters = {
            "type": "object",
            "properties": properties,
            "required": ["data_file_path", "target_column"],
        }
        function_spec = {
            "name": self.name,
            "description": self.description,
            "parameters": parameters,
        }
        return {"type": "function", "function": function_spec}

    @property
    def description_for_user(self) -> str:
        """Longer description intended for display to the end user."""
        return """
Uses the data_suite_insights tool to gain insights regions of the dataset that the model may perform poorly on.
The tool provides exemplar records that the user may want to collect more records similar to in order to improve the model's performance.
"""