# Source code for climb.tool.impl.tool_exploratory_data_analysis

import os
import time
from typing import Any, Dict, List, Optional, Tuple

import matplotlib
import matplotlib.figure
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from .._utils import id_numerics_actually_categoricals
from ..tool_comms import ToolCommunicator, ToolReturnIter, execute_tool
from ..tools import ToolBase, get_str_up_to_marker



# [docs]
def generate_correlogram(
    df: pd.DataFrame,
    numerics_that_are_categoricals: List[str],
    workspace: str,
    show_n_corr: int = 10,  # Number of most correlated pairs to show.
    target: Optional[str] = None,
) -> Tuple[str, matplotlib.figure.Figure]:
    """Plot a correlogram of the most correlated features and summarise the strongest pairs.

    The columns named in ``numerics_that_are_categoricals`` are replaced by their
    categorical codes (on an internal copy), the pairwise correlation matrix of all
    numeric columns is computed, and a lower-triangle heatmap restricted to the
    features taking part in the ``show_n_corr`` strongest pairs (by absolute
    correlation) is saved as ``correlogram.png`` inside ``workspace``.

    Args:
        df: Input data. It is not modified; all re-encoding happens on a copy.
        numerics_that_are_categoricals: Names of columns to convert to categorical codes
            before computing correlations.
        workspace: Directory in which ``correlogram.png`` is written.
        show_n_corr: Number of strongest pairs used to pick the heatmap features, and the
            maximum number of rows in each of the positive / negative text tables.
        target: Optional target column. When it is one of the numeric columns it is always
            included in the heatmap, whether or not it appears in a top pair.

    Returns:
        A tuple ``(as_text, fig)``: a text block listing the most positively and most
        negatively correlated feature pairs, and the (already saved and closed) figure.
    """
    # Work on a copy: the re-encoding below must not leak into the caller's DataFrame,
    # which may still be used for further analysis after this call.
    df = df.copy()

    # Convert likely categorical numerics to categorical codes.
    for col in numerics_that_are_categoricals:
        df[col] = pd.Categorical(df[col]).codes

    # Correlations are only defined for numerical columns.
    num_df = df.select_dtypes(include=[np.number])
    corr_matrix = num_df.corr()

    # Keep the strict upper triangle so that every unordered pair appears exactly once and
    # self-correlations are excluded. (De-duplicating by correlation *value* instead would
    # also discard distinct pairs that merely share the same coefficient, e.g. |r| == 1.)
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # One row per feature pair.
    stacked_corr = upper_tri.stack().reset_index()
    stacked_corr.columns = ["Feature 1", "Feature 2", "Correlation"]
    # Undefined correlations (e.g. constant columns) carry no information; drop them
    # explicitly rather than relying on ``stack`` to do so.
    stacked_corr = stacked_corr.dropna(subset=["Correlation"])

    # Sort by absolute values of correlation, descending.
    sorted_corr = stacked_corr.reindex(stacked_corr.Correlation.abs().sort_values(ascending=False).index)

    # Features involved in the strongest pairs, plus the target if it is numeric.
    top_pairs = sorted_corr.head(show_n_corr)
    selected = set(top_pairs["Feature 1"]) | set(top_pairs["Feature 2"])
    if target and target in num_df.columns:
        selected.add(target)

    # Use an ordered list (original column order) instead of a set so that the heatmap
    # layout is deterministic between runs.
    features = [col for col in corr_matrix.columns if col in selected]
    filtered_corr_matrix = corr_matrix.loc[features, features]

    # Mask the upper triangle (diagonal included) so each pair is drawn only once.
    mask = np.triu(np.ones_like(filtered_corr_matrix, dtype=bool))

    # Use an explicit figure/axes pair rather than pyplot's implicit "current figure".
    fig, ax = plt.subplots(figsize=(11, 9))
    try:
        sns.heatmap(
            filtered_corr_matrix,
            mask=mask,
            cmap="coolwarm",
            vmax=1.0,
            vmin=-1.0,
            annot=True,
            fmt=".2f",
            square=True,
            linewidths=0.5,
            cbar_kws={"shrink": 0.8},
            annot_kws={"size": 8},
            ax=ax,
        )
        ax.set_title("Correlation Matrix of Top Correlated Features")
        fig.savefig(os.path.join(workspace, "correlogram.png"))
    finally:
        # Always release the figure from pyplot's registry, even if drawing or saving failed.
        plt.close(fig)

    # Extract the top most positively correlated feature pairs.
    most_positive_corr = sorted_corr[sorted_corr["Correlation"] > 0].head(show_n_corr).reset_index(drop=True)

    # Extract the top most negatively correlated feature pairs.
    most_negative_corr = sorted_corr[sorted_corr["Correlation"] < 0].head(show_n_corr).reset_index(drop=True)

    # Combine and return the most correlated pairs.
    as_text = f"""
Most Positively Correlated Features:
{most_positive_corr}

Most Negatively Correlated Features:
{most_negative_corr}
"""

    return as_text, fig




# [docs]
def exploratory_data_analysis(
    tc: ToolCommunicator,
    data_file_path: str,
    target: Optional[str],
    workspace: str,
) -> None:
    """Perform exploratory data analysis (EDA) on a CSV file, outputting a detailed textual summary.

    Key features:
    1. Dataset Overview:
        Reports the dataset's dimensions and column data types.
    2. Numerical Feature Analysis:
        Provides statistics (mean, median...), including skewness and kurtosis, to detail numerical data distribution.
    3. Categorical Variable Analysis:
        Lists unique counts, top and rare categories, aiding in the assessment of categorical data distribution.
    4. Missing Values Analysis:
        Identifies and counts missing values per column, essential for data cleaning.
    5. Correlation Analysis:
        Calculates most (anti-)correlated features, creates a correlogram.
    6. Outliers Identification:
        Detects outliers using IQR, reporting counts and bounds, crucial for data quality assessment.
    7. Duplicate Records Analysis:
        Checks and reports the count of duplicate records, important for ensuring data integrity.

    Args:
        tc (ToolCommunicator): tool communicator object; receives progress messages via ``print`` and the
            final results via ``set_returns``.
        data_file_path (str): path to the CSV data file.
        target (Optional[str]): target feature name, forwarded to the correlogram generation; may be None.
        workspace (str): directory passed to the correlogram generation, which saves its image there.

    Returns:
        None: the detailed EDA report is not returned directly; it is handed to ``tc.set_returns`` as
        ``tool_return``, together with an optional ``user_report`` containing the correlogram figure.
    """
    # NOTE: the docstring text that precedes "Args" is read at runtime as the tool description
    # (see ExploratoryDataAnalysis.description), so that part must be edited with care.
    df = pd.read_csv(data_file_path)
    report: List[str] = []

    # Dataset basic info
    tc.print("Getting dataset basic info...")
    time.sleep(0.4)  # Short pause between progress messages.
    report.append(f"Dataset Shape: {df.shape[0]} rows and {df.shape[1]} columns\n")
    report.append(f"Column Names and Types:\n{df.dtypes.to_string()}\n\n")

    # Enhanced descriptive statistics for numerical features
    tc.print("Getting descriptive statistics for numerical features...")
    report.append("Descriptive Statistics for Numerical Features:\n")
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.shape[1] > 0:
        numerical_stats = df.describe(include=[np.number])
        numerical_stats.loc["skew"] = numeric_df.skew()  # type: ignore
        numerical_stats.loc["kurt"] = numeric_df.kurtosis()  # type: ignore
        report.append(f"{numerical_stats.to_string()}\n\n")
    else:
        # Nothing to describe; say so instead of failing on an empty selection.
        report.append("No numerical features found.\n\n")

    # Detailed information on categorical variables
    tc.print("Getting detailed information on categorical variables...")
    time.sleep(0.4)
    numerics_that_are_categoricals = id_numerics_actually_categoricals(df)
    numerics_that_are_categoricals_info = (
        "Identified numeric value columns that should most likely be considered categoricals:"
        f"\n{numerics_that_are_categoricals}.\n"
        "This is done by checking whether the column contains only integers and "
        "has a low number of unique values (<20 or <5% of total examples).\n"
    )
    categorical_names = set(df.select_dtypes(include=["object"]).columns).union(numerics_that_are_categoricals)
    # Walk the columns in file order so the report is deterministic (set order is not).
    categorical_columns = [col for col in df.columns if col in categorical_names]
    report.append(f"{numerics_that_are_categoricals_info}\n")
    report.append("Detailed Information on Categorical Variables:\n")
    for col in categorical_columns:
        n_unique = df[col].nunique()
        counts = df[col].value_counts()
        report.append(f"{col} - Unique Values: {n_unique} \nTop 5 Values:\n{counts.head().to_string()}")
        if n_unique > 5:
            report.append(f"\nRare Categories:\n{counts.tail(5).to_string()}\n\n")
        else:
            report.append("\n\n")

    # Missing values analysis
    tc.print("Performing missing values analysis...")
    time.sleep(0.4)
    missing_values = df.isnull().sum()
    report.append("Missing Values Analysis:\n")
    if missing_values.any():
        report.append(f"{missing_values[missing_values > 0].to_string()}\n\n")
    else:
        report.append("No missing values found.\n\n")
    # Count rows / columns that consist entirely of NaN values.
    all_nan_rows = df.isna().all(axis=1).sum()
    all_nan_columns = df.isna().all(axis=0).sum()
    if all_nan_rows > 0:
        report.append(f"Count of rows with all NaN values: {all_nan_rows}\n")
    if all_nan_columns > 0:
        report.append(f"Count of columns with all NaN values: {all_nan_columns}\n")

    # Correlation analysis
    tc.print("Performing correlation analysis...")
    time.sleep(0.4)
    correlogram_fig = None
    try:
        # Hand over a copy: the correlogram step re-encodes categorical-like columns, and the
        # outlier / duplicate sections below must keep working on the original values.
        correlogram_text, correlogram_fig = generate_correlogram(
            df.copy(),
            numerics_that_are_categoricals=numerics_that_are_categoricals,
            show_n_corr=10,
            workspace=workspace,
            target=target,
        )
    except Exception:
        # Best effort: a failed correlogram must not abort the rest of the report.
        report.append("There was a problem generating the correlogram, skipped.")
        correlogram_fig = None
    else:
        report.append("Correlation Analysis:\n")
        report.append(correlogram_text)

    # Potential outliers identification with more details
    tc.print("Performing potential outliers identification...")
    time.sleep(0.4)
    report.append("\nOutlier Identification for Numerical Features:\n")
    for col in df.select_dtypes(include=[np.number]).columns:  # type: ignore
        values = df[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        # Fences are placed 1.75 * IQR beyond the quartiles.
        lower_bound = Q1 - 1.75 * IQR
        upper_bound = Q3 + 1.75 * IQR
        outlier_count = int(((values < lower_bound) | (values > upper_bound)).sum())
        report.append(
            f"{col} - Outliers Count: {outlier_count}\n[Lower Bound: {lower_bound:.3g}, "
            f"Upper Bound: {upper_bound:.3g}]\n"
        )

    # Duplicate records analysis
    tc.print("Performing duplicate records analysis...")
    time.sleep(0.4)
    duplicates = df.duplicated().sum()
    report.append(f"\nDuplicate Records: {duplicates}\n\n")

    analysis_summary = "".join(report)

    # Only attach a user-facing report when a correlogram figure is available.
    user_report: Optional[List[Any]]
    if correlogram_fig is not None:
        user_report = [
            "Here is a correlogram showing the correlation between features:",
            correlogram_fig,
            """A correlogram is a visual tool that shows the relationships between different variables (or features) \
in a dataset. It presents a grid of color-coded squares, where each square represents the strength and direction \
of the relationship between two variables. Brighter or darker colors indicate stronger relationships. \
Positive relationships (where variables increase together) and negative relationships (where one variable increases \
as the other decreases) are shown with different colors. Here we use reds to denote positive, and blue to denote \
negative relationships. This makes it easy to see which pairs of variables are related, and how closely they are \
connected.
""",
        ]
    else:
        user_report = None
    tc.set_returns(
        tool_return=analysis_summary,
        user_report=user_report,
    )




# [docs]
class ExploratoryDataAnalysis(ToolBase):
    """Tool wrapper that exposes ``exploratory_data_analysis`` under the name "EDA"."""

    @property
    def name(self) -> str:
        """Identifier of the tool; also used as the function name in ``specification``."""
        return "EDA"

    @property
    def description(self) -> str:
        """Tool description: the docstring of ``exploratory_data_analysis`` cut at the "Args" marker."""
        eda_doc = exploratory_data_analysis.__doc__
        return get_str_up_to_marker(eda_doc, "Args")  # type: ignore

    @property
    def description_for_user(self) -> str:
        """Short description of the tool phrased for the end user."""
        return "performs exploratory data analysis on your data, providing a summary of its characteristics."

    @property
    def specification(self) -> Dict[str, Any]:
        """Function-call style schema of the tool: one required and one optional string parameter."""
        properties: Dict[str, Any] = {
            "data_file_path": {"type": "string", "description": "Path to the data file."},
            "target": {
                "type": "string",
                "description": "Target feature name.",
                "default": None,
            },
        }
        parameters: Dict[str, Any] = {
            "type": "object",
            "properties": properties,
            "required": ["data_file_path"],
        }
        function_spec: Dict[str, Any] = {
            "name": self.name,
            "description": self.description,
            "parameters": parameters,
        }
        return {"type": "function", "function": function_spec}

    def _execute(self, **kwargs: Any) -> ToolReturnIter:
        """Launch the EDA through ``execute_tool`` and return its output stream.

        Expects ``data_file_path`` in ``kwargs`` (joined onto the working directory) and an
        optional ``target``. The first value returned by ``execute_tool`` is stored on
        ``self.tool_thread``; the second is returned to the caller.
        """
        working_dir = self.working_directory
        data_path = os.path.join(working_dir, kwargs["data_file_path"])
        target_name = kwargs.get("target")
        tool_thread, output_stream = execute_tool(
            exploratory_data_analysis,
            wd=working_dir,
            data_file_path=data_path,
            workspace=working_dir,
            target=target_name,
        )
        self.tool_thread = tool_thread
        return output_stream