# Source code for climb.tool.impl.tool_feature_selection

import os
from typing import Any, Dict, Optional

import numpy as np
import pandas as pd
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder

from ..tool_comms import ToolCommunicator, ToolReturnIter, execute_tool
from ..tools import ToolBase


# Datasets with a larger share of missing cells are rejected and the user is asked to impute first.
_MAX_MISSING_FRACTION = 0.1

# HACK: limit the number of rows for speed. Could run multiple subsets instead?
_MAX_ROWS = 100000

# Temporary binary label that replaces the (event, time) pair for survival tasks.
_SURVIVAL_LABEL_COLUMN = "TEMP_CLASSIFICATION_FROM_SURVIVAL_EVENT_COL"

_TOO_MANY_MISSING_MESSAGE = (
    "The dataset has more than 10% missing values. Please impute the missing values with a tool like "
    "HyperImpute before running this tool."
)


def _impute_sparse_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Fill a small number of gaps: mode for object columns, median for numeric columns."""
    for col in df.select_dtypes(include=["object"]).columns:
        modes = df[col].mode()
        # `mode()` is empty when the column holds no observed value at all; nothing to fill with then.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])
    # `numeric_only=True` keeps this a *median* imputation on every pandas version. Without it,
    # pandas >= 2 raises TypeError as soon as the frame contains an object column.
    return df.fillna(df.median(numeric_only=True))


def _survival_to_classification(df: pd.DataFrame, event_column: str, time_column: str) -> pd.DataFrame:
    """Replace the event/time columns with a binary "event before the median time" label."""
    time_horizon = df[time_column].median()
    df[_SURVIVAL_LABEL_COLUMN] = np.where((df[time_column] < time_horizon) & (df[event_column] == 1), 1, 0)
    return df.drop(columns=[time_column, event_column])


def _encode_object_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Ordinal-encode every object column (the target included) so the frame is fully numeric."""
    encoder = OrdinalEncoder()
    for col in df.select_dtypes(include=["object"]).columns:
        df[col] = df[col].astype(str)
        if df[col].nunique() > 1:
            # The encoder expects a 2-D input; flatten the single output column before assigning it back.
            df[col] = encoder.fit_transform(df[col].to_numpy().reshape(-1, 1)).ravel()
        else:
            # A constant column carries no information; use a fixed placeholder value.
            df[col] = 1.0
    return df


def feature_selection(
    tc: ToolCommunicator,
    data_file_path: str,
    workspace: str,  # pylint: disable=unused-argument
    task_type: str,
    target_column: str,
    time_column: Optional[str] = None,
) -> None:
    """Run Boruta feature selection on a CSV dataset and report the selected features through ``tc``.

    Steps: load the CSV; refuse datasets with more than 10% missing cells; otherwise impute the few
    missing cells (mode for object columns, median for numeric columns); subsample to at most 100,000
    rows; for survival analysis derive a binary label ("event occurred before the median time");
    ordinal-encode object columns; fit Boruta around a random forest. If Boruta selects no feature,
    all features are reported.

    Args:
        tc: Communicator used to print progress and to set the tool's return message.
        data_file_path: Path of the CSV file to analyse.
        workspace: Unused; kept for interface compatibility.
        task_type: One of ``"classification"``, ``"regression"`` or ``"survival_analysis"``.
        target_column: Column to predict. For survival analysis this is the event column.
        time_column: Time-to-event column; required when ``task_type`` is ``"survival_analysis"``.

    Raises:
        ValueError: If ``time_column`` is missing for a survival task, if the dataset has no cells,
            or if ``task_type`` is not one of the supported values.
    """
    tc.print("Setting up feature pruner...")

    if task_type == "survival_analysis" and time_column is None:
        raise ValueError('`time_column` was not provided, but it is required for "survival_analysis" `task_type`.')

    df = pd.read_csv(data_file_path)
    if df.size == 0:
        # Guard the ratio below against a division by zero and fail with a readable message.
        raise ValueError("The dataset is empty; feature selection needs at least one row and one column.")

    # Share of missing cells over the whole table.
    missing_fraction = df.isnull().sum().sum() / df.size
    if missing_fraction > _MAX_MISSING_FRACTION:
        tc.print(_TOO_MANY_MISSING_MESSAGE)
        tc.set_returns(_TOO_MANY_MISSING_MESSAGE)
        return
    if missing_fraction > 0:
        # Too few gaps to have a significant impact, so a simple imputation is enough.
        df = _impute_sparse_missing(df)

    if len(df) > _MAX_ROWS:
        df = df.sample(n=_MAX_ROWS, random_state=1)

    if task_type == "survival_analysis":
        # Convert to classification using the median time as the horizon.
        df = _survival_to_classification(df, event_column=target_column, time_column=time_column)  # type: ignore
        target_column = _SURVIVAL_LABEL_COLUMN
        task_type = "classification"

    df = _encode_object_columns(df)

    y = df[target_column]
    X = df.drop(target_column, axis=1)

    if task_type == "classification":
        forest = RandomForestClassifier(random_state=1, n_estimators=1000, max_depth=5)
    elif task_type == "regression":
        forest = RandomForestRegressor(random_state=1, n_estimators=1000, max_depth=5)
    else:
        raise ValueError(f"Invalid task type: {task_type}")

    boruta_selector = BorutaPy(
        forest,
        n_estimators="auto",  # type: ignore
        verbose=0,
        random_state=1,
    )
    boruta_selector.fit(X.to_numpy(), y.to_numpy())

    important_features = X.columns[boruta_selector.support_].to_list()  # type: ignore
    if not important_features:
        # Boruta confirmed nothing: report every feature rather than an empty list.
        important_features = X.columns.to_list()

    out_message = f"Here are the selected features: {', '.join(important_features)}"
    tc.print(out_message)
    tc.set_returns(out_message)
class BorutaFeatureSelection(ToolBase):
    """Tool wrapper that exposes ``feature_selection`` under the tool name ``feature_selection``."""

    @property
    def name(self) -> str:
        """Identifier of the tool, also used as the function name in ``specification``."""
        return "feature_selection"

    @property
    def description(self) -> str:
        """Description of the tool embedded in ``specification``."""
        return (
            " Uses an **automated feature selection** library to suggest the most important features in the "
            "dataset. Users may want to use this tool to drop the features that are not in the list of important "
            "features. Reducing the number of features in their dataset, which can help to reduce overfitting and "
            "improve the performance of machine learning models. They may not want to drop all unimportant "
            "features, as they may have domain knowledge that suggests that some features are important. "
        )

    @property
    def description_for_user(self) -> str:
        """Short description of the tool, phrased for the end user."""
        return "uses the **automated feature selection** library to automatically find the most important features in your data."

    @property
    def specification(self) -> Dict[str, Any]:
        """Function-calling schema: three required string arguments plus the optional ``time_column``."""
        properties: Dict[str, Any] = {
            "data_file_path": {"type": "string", "description": "Path to the data file."},
            "task_type": {
                "type": "string",
                "description": "The type of task that the user is working on. This can one of the following: 'classification', 'regression', 'survival_analysis'.",
            },
            "target_column": {
                "type": "string",
                "description": "The target column to predict in the research task. For survival analysis this should be the event column.",
            },
            "time_column": {
                "type": "string",
                "description": "The time to event column. This is only applicable for survival analysis tasks, where it is mandatory.",
            },
        }
        parameters: Dict[str, Any] = {
            "type": "object",
            "properties": properties,
            "required": ["data_file_path", "task_type", "target_column"],
        }
        function_spec: Dict[str, Any] = {
            "name": self.name,
            "description": self.description,
            "parameters": parameters,
        }
        return {"type": "function", "function": function_spec}

    def _execute(self, **kwargs: Any) -> ToolReturnIter:
        """Start ``feature_selection`` through ``execute_tool`` and return its output stream.

        The data file path is joined onto the tool's working directory, which is also passed as
        both ``wd`` and ``workspace``. The thread returned by ``execute_tool`` is stored on
        ``self.tool_thread``.
        """
        base_dir = self.working_directory
        dataset_path = os.path.join(base_dir, kwargs["data_file_path"])
        worker, stream = execute_tool(
            feature_selection,
            wd=base_dir,
            data_file_path=dataset_path,
            task_type=kwargs["task_type"],
            target_column=kwargs["target_column"],
            time_column=kwargs.get("time_column"),
            workspace=base_dir,
        )
        self.tool_thread = worker
        return stream