Source code for climb.tool.impl.tool_data_centric

import os
import pickle
from typing import Any, Dict

import numpy as np
import pandas as pd
import plotly.express as px
import xgboost as xgb
from autoprognosis.utils.serialization import load_model_from_file
from data_iq import DataIQ_SKLearn

from ..tool_comms import ToolCommunicator, ToolReturnIter, execute_tool
from ..tools import ToolBase



[docs]
def dataiq_insights(
    tc: ToolCommunicator,
    data_file_path: str,
    target_variable: str,
    model_path: str,  # TODO: remove.
    workspace: str,
) -> None:
    """Run a Data-IQ analysis and report easy / hard / ambiguous samples.

    The CSV at ``data_file_path`` is read and cleaned with ``clean_dataframe``,
    an XGBoost classifier is fitted on all rows, and Data-IQ confidence and
    aleatoric uncertainty are collected from it. Samples are then bucketed with
    heuristic thresholds, the results are pickled to
    ``<workspace>/dataiq_results.p``, and a scatter plot plus explanatory text
    are handed back through ``tc.set_returns``.

    Args:
        tc: Communicator used for progress messages and for returning results.
        data_file_path: Path to the CSV file holding features and target.
        target_variable: Name of the target column in the CSV.
        model_path: Path to an AutoPrognosis 2.0 model file. The model is only
            loaded to validate the file; the analysis itself uses a freshly
            fitted XGBoost classifier.
        workspace: Directory in which the results pickle is written.

    Raises:
        TypeError: If the model file cannot be loaded by
            ``load_model_from_file``.
    """
    df = pd.read_csv(data_file_path)

    # Convert "object" columns to categorical columns.
    df = clean_dataframe(df)

    features = [c for c in df.columns if c != target_variable]
    X = df[features]
    y = df[target_variable].to_numpy()

    tc.print("Loading the model...")
    try:
        # The loaded model is intentionally unused; this only validates the file.
        model = load_model_from_file(model_path)  # noqa: F841
    except Exception as e:
        raise TypeError(
            "Model file is not a valid AutoPrognosis 2.0 file. This tool only supports AutoPrognosis 2.0 models."
        ) from e

    nest = 100
    clf = xgb.XGBClassifier(n_estimators=nest)
    clf.fit(X, y)

    tc.print("Running DataIQ...")
    dataiq = DataIQ_SKLearn(X=X, y=y)
    for i in range(1, nest):
        dataiq.on_epoch_end(clf=clf, iteration=i)
    aleatoric_uncertainty = dataiq.aleatoric
    confidence = dataiq.confidence

    # Determine easy/hard/ambiguous samples.
    tc.print("Determining easy/hard/ambiguous samples...")
    # NOTE: The thresholds here are heuristics.
    percentile_thresh = 90  # Originally 50
    thresh = max(
        0.25,
        (np.max(confidence) - np.min(confidence)) * 0.25 + np.min(confidence),
    )  # Originally just 0.25
    conf_thresh_low = thresh
    conf_thresh_high = 1 - thresh
    # NOTE(review): when `thresh` exceeds 0.5 the low threshold is above the high
    # one, so a sample can satisfy both the "hard" and the "easy" condition.
    # "Hard" wins in the plot labels below because it is assigned last.

    # The uncertainty cutoff is shared by both groups, so compute it once.
    uncertainty_cutoff = np.percentile(aleatoric_uncertainty, percentile_thresh)
    low_uncertainty = aleatoric_uncertainty <= uncertainty_cutoff
    hard_mask = (confidence <= conf_thresh_low) & low_uncertainty
    easy_mask = (confidence >= conf_thresh_high) & low_uncertainty

    hard_train = np.where(hard_mask)[0]
    easy_train = np.where(easy_mask)[0]
    # Everything that is neither hard nor easy is ambiguous. Using masks keeps
    # this linear in the number of samples and always yields integer indices,
    # even when the group is empty.
    ambig_train = np.where(~(hard_mask | easy_mask))[0]

    # Save the results.
    tc.print("Saving the results...")
    results = {
        "aleatoric_uncertainty": aleatoric_uncertainty,
        "confidence": confidence,
        "easy_samples": easy_train,
        "hard_samples": hard_train,
        "ambiguous_samples": ambig_train,
        "df": df,
        "target_variable": target_variable,
    }
    results_path = os.path.join(workspace, "dataiq_results.p")
    with open(results_path, "wb") as f:
        pickle.dump(results, f)

    tc.print("Preparing the plot...")

    # Count the different groups and put into text.
    counts = (
        f"Easy: {len(easy_train)} samples\n"
        f"Hard: {len(hard_train)} samples\n"
        f"Ambiguous: {len(ambig_train)} samples"
    )

    # Put into a dataframe for plotly. These columns are added after pickling,
    # so they are not part of the saved "df".
    df["aleatoric_uncertainty"] = aleatoric_uncertainty
    df["confidence"] = confidence
    df["data_iq_group"] = None
    df.loc[easy_train, "data_iq_group"] = "Easy"
    df.loc[ambig_train, "data_iq_group"] = "Ambiguous"
    df.loc[hard_train, "data_iq_group"] = "Hard"
    df["Row Index"] = np.arange(len(df))

    hover_show = features + ["Row Index", target_variable]

    # Downsample df if more than 1000 samples for plotting.
    if len(df) > 1000:
        df_fig = df.sample(n=1000, random_state=42)
        tc.print("Downsampled to 1000 samples for plotting to avoid UI issues.")
    else:
        df_fig = df

    # Pin each group to its colour explicitly. A plain colour sequence is
    # assigned in order of first appearance in the data, which would not
    # guarantee green for "Easy" and red for "Hard".
    group_colors = {"Easy": "#27ae60", "Ambiguous": "#f8c471", "Hard": "#e74c3c"}

    fig = px.scatter(
        df_fig,
        x="aleatoric_uncertainty",
        y="confidence",
        color="data_iq_group",
        color_discrete_map=group_colors,
        category_orders={"data_iq_group": list(group_colors)},
        hover_data=hover_show,
        title="Data-IQ Insights",
        labels={
            "aleatoric_uncertainty": "Aleatoric Uncertainty",
            "confidence": "Confidence",
            "data_iq_group": "Data-IQ Group",
        },
        width=1000,
        height=800,
    )

    # The below text is left-aligned to avoid strange formatting in the UI.

    # Additional explanation.
    explanation_of_axes = """
In DataIQ the two axes are:
- **Confidence** represents the model's confidence in the predictions
- **Aleatoric uncertainty** is the inherent uncertainty or ambiguity in a sample. That is, even if we add more \
samples, we'd still be confused. For example, in a tabular dataset, two patients with the same features but \
different labels.
"""
    explanation_of_groups = """
In DataIQ, the samples are categorized into three groups:
- **Easy**: Samples which we are confident about and are clear-cut (low data uncertainty), hence easy to learn.
- **Ambiguous**: Samples for which there is just uncertainty in the data itself. For instance, for tabular data \
in the medical setting, this could be: "the only way we'll get better predictions is if we get more information, \
e.g. patients for whom we need to run more tests).
- **Hard**: Possibly mislabeled data. Since these are really clear-cut samples (low data uncertainty), but the \
model just can't learn them, i.e. samples which are unlearnable in their current state.
"""

    tc.set_returns(
        tool_return=f"""
Results saved to: `{results_path}`.
This is a pickle file containing a dictionary with keys:
{{
"easy_samples": numpy array with indices of easy samples,
"hard_samples": numpy array with indices of hard samples,
"ambiguous_samples": numpy array with indices of ambiguous samples,
}}

Sample counts summary:
{counts}

{explanation_of_axes}

{explanation_of_groups}

The user can now explore different groups of samples on an interactive chart.
""",
        user_report=[
            fig,
            f"""
**Additional explanation:**

{explanation_of_axes}

{explanation_of_groups}
""",
        ],
    )




[docs]
class DataIQInsights(ToolBase):
    """Tool wrapper that exposes ``dataiq_insights`` through the tool interface."""

    def _execute(self, **kwargs: Any) -> ToolReturnIter:
        """Launch ``dataiq_insights`` via ``execute_tool`` and return its output stream.

        Expects ``data_file_path``, ``target_variable`` and ``model_path`` in
        ``kwargs``. Both paths are joined onto the tool's working directory,
        which is also used as the workspace for the results file.
        """
        real_data_path = os.path.join(self.working_directory, kwargs["data_file_path"])
        real_model_path = os.path.join(self.working_directory, kwargs["model_path"])
        target_variable = kwargs["target_variable"]
        thrd, out_stream = execute_tool(
            dataiq_insights,
            wd=self.working_directory,
            data_file_path=real_data_path,
            target_variable=target_variable,
            model_path=real_model_path,
            workspace=self.working_directory,
        )
        self.tool_thread = thrd
        return out_stream

    @property
    def name(self) -> str:
        """Identifier of the tool, used as the function name in the specification."""
        return "dataiq_insights"

    @property
    def description(self) -> str:
        """Description of the tool embedded in the function specification."""
        return """
        DataIQ Insights for a classification task.

        Given a dataset and a classifier model, this tool provides data-centric insights for a classification task.
        In particular it categorizes the samples into "easy", "hard" and "ambiguous" for classification. 
        """

    @property
    def specification(self) -> Dict[str, Any]:
        """Function-calling specification describing the tool's parameters."""
        return {
            "type": "function",
            "function": {
                "name": self.name,
                "description": self.description,
                "parameters": {
                    "type": "object",
                    "properties": {
                        "data_file_path": {"type": "string", "description": "Path to the data file."},
                        "target_variable": {"type": "string", "description": "Name of the target variable."},
                        "model_path": {"type": "string", "description": "Path to the model file."},
                    },
                    # `_execute` indexes all three keys unconditionally, so the
                    # schema must declare them as required.
                    "required": ["data_file_path", "target_variable", "model_path"],
                },
            },
        }

    @property
    def description_for_user(self) -> str:
        """Short user-facing summary of what the tool provides."""
        return (
            "provides insights for your classification task - which samples were 'easy', 'hard' or 'ambiguous' "
            "for classification."
        )



# TODO: abstract this into a shared module

[docs]
def clean_dataframe(df: pd.DataFrame, unique_threshold: int = 15) -> pd.DataFrame:
    """Encode a dataframe's columns as numbers, modifying it in place.

    Each column is assigned to exactly one kind:

    * boolean dtype: cast to ``int``;
    * fewer than ``unique_threshold`` distinct non-null values, or ``object``
      dtype: replaced by its categorical codes (missing values become ``-1``);
    * any other numeric dtype: coerced with ``pd.to_numeric`` and missing
      values filled with the column median;
    * anything else: coerced to numeric if possible, otherwise treated as
      categorical.

    Args:
        df: Dataframe to clean. It is mutated in place.
        unique_threshold: Columns with fewer distinct non-null values than this
            are treated as categorical regardless of dtype.

    Returns:
        The same dataframe object, after cleaning.
    """
    categorical_columns = []
    numerical_columns = []
    boolean_columns = []

    # Identify column data types. The if/elif chain puts every column in
    # exactly one of the three lists.
    for col in df.columns:
        # nunique() ignores missing values by default.
        num_unique_values = df[col].nunique()

        if df[col].dtype == "bool":
            boolean_columns.append(col)
        elif num_unique_values < unique_threshold or df[col].dtype == "object":
            categorical_columns.append(col)
        elif pd.api.types.is_numeric_dtype(df[col]):
            numerical_columns.append(col)
        else:
            # Handle mixed or unexpected data types: prefer a numeric
            # interpretation and fall back to categorical if conversion fails.
            try:
                df[col] = pd.to_numeric(df[col], errors="coerce")
            except (ValueError, TypeError):
                categorical_columns.append(col)
            else:
                numerical_columns.append(col)

    # Convert categorical columns to category indices
    for col in categorical_columns:
        df[col] = pd.Categorical(df[col]).codes

    # Clean numerical columns
    for col in numerical_columns:
        numeric = pd.to_numeric(df[col], errors="coerce")
        # Fill missing values with the median. Assign the result back instead
        # of calling fillna(inplace=True) on `df[col]`, which acts on a
        # temporary object and is not guaranteed to update `df`.
        df[col] = numeric.fillna(numeric.median())

    # Convert boolean columns to integers
    for col in boolean_columns:
        df[col] = df[col].astype(int)

    return df