Source code for climb.tool.impl.tool_feature_extraction_from_text

import json
import os
from pathlib import Path
from typing import Any, Dict

import pandas as pd
import spacy
from spacy.matcher import Matcher
from tqdm import tqdm  # Optional: For progress visualization

from ..tool_comms import ToolCommunicator, ToolReturnIter, execute_tool
from ..tools import ToolBase


def feature_extraction_from_text(
    tc: ToolCommunicator,
    data_file_path: str,
    extracted_data_file_path: str,
    topics_dict: str,
    workspace: str,
) -> None:
    """
    Extract specified categorical topics from free-text fields in a pandas DataFrame.

    For every free-text column named in ``topics_dict`` a binary indicator column
    ``<column>_<topic>`` (spaces in the topic replaced by underscores) is added. It is set
    to 1 on rows whose text contains a lemma-level match for any synonym of that topic.
    The free-text columns themselves are dropped from the saved output.

    Parameters:
    - tc (ToolCommunicator): Used to print progress messages and to set the tool's return value.
    - data_file_path (str): Path to the input CSV file.
    - extracted_data_file_path (str): Path to the output CSV file with extracted features.
    - topics_dict (str): A JSON string encoding a nested dictionary where keys are free-text
        column names, and values are dictionaries mapping topics to their synonyms. An
        already-parsed dictionary is accepted as well. e.g.
        topics_dict = {
            "column1": {
                "topic1": ["synonym1", "synonym2"],
                "topic2": ["synonym3", "synonym4"]
            },
            "column2": {
                "topic1": ["synonym1", "synonym2"],
                "topic3": ["synonym5", "synonym6"]
            },
        }
    - workspace (str): The path to the workspace directory. Both file paths are joined onto
        it; pathlib keeps an absolute right-hand operand unchanged, so absolute paths pass
        through as they are.
    """
    # Load spaCy model with disabled components for speed
    try:
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    except OSError:
        # If the model is not found, download it
        from spacy.cli import download  # pyright: ignore

        download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    # Resolve the file locations inside the workspace
    workspace_dir = Path(workspace)
    input_path = workspace_dir / data_file_path
    output_path = workspace_dir / extracted_data_file_path

    # `source_df` is kept untouched so the raw text stays available for matching.
    # It must NOT go through `clean_dataframe`: that helper integer-encodes object
    # columns, which would replace the free text with category codes before matching.
    source_df = pd.read_csv(input_path)
    result_df = source_df.copy()

    # Convert topics_dict from JSON string to Python dictionary (tolerate a ready-made dict)
    topics_by_field = json.loads(topics_dict) if isinstance(topics_dict, str) else topics_dict

    tc.print(f"Extracting topics from free text fields in the DataFrame using these concepts: \n{topics_by_field}")

    # Number of matcher hits per requested field (stays 0 for fields missing from the data)
    field_match_count = {field: 0 for field in topics_by_field}

    for field, topics in topics_by_field.items():
        tc.print(f"\nProcessing field: '{field}'")

        # Check if the field exists in the DataFrame
        if field not in source_df.columns:
            tc.print(f"Warning: Field '{field}' not found in DataFrame.")
            continue

        # A fresh matcher per field: patterns registered for one column must not
        # fire on (or be counted for) the text of another column.
        matcher = Matcher(nlp.vocab)

        for topic, synonyms in topics.items():
            sanitized_topic = topic.replace(" ", "_")
            result_df[f"{field}_{sanitized_topic}"] = 0  # Binary indicator

            # One lemma-based pattern per (lowercased) synonym
            patterns = []
            for synonym in synonyms:
                pattern = [{"LEMMA": token.lemma_} for token in nlp(synonym.lower())]
                # An empty synonym produces no tokens; the matcher rejects empty patterns
                if pattern:
                    patterns.append(pattern)
            if patterns:
                matcher.add(sanitized_topic, patterns)

        tc.print(f"\nExtracting topics from {field}...")
        text_column = source_df[field]
        # Missing cells become empty text instead of the literal string "nan"
        texts = ["" if pd.isna(value) else str(value) for value in text_column]

        # nlp.pipe processes the texts as a stream, yielding one Doc per text in order
        for idx, doc in tqdm(
            zip(text_column.index, nlp.pipe(texts)),
            total=len(texts),
            desc="Processing texts",
        ):
            matches = matcher(doc)

            # Set the corresponding topic columns to 1 if any synonym is found
            for match_id, _start, _end in matches:
                topic_key = nlp.vocab.strings[match_id]  # Get string representation
                result_df.at[idx, f"{field}_{topic_key}"] = 1  # Mark presence

            # Increment the match count for the field
            field_match_count[field] += len(matches)

    # Drop the original free text fields from the output
    result_df.drop(
        columns=[field for field in topics_by_field if field in result_df.columns],
        inplace=True,
    )

    # Save the final DataFrame
    result_df.to_csv(output_path, index=False)

    match_count_string = "\n".join(
        [f"Number of matches found in '{field}': {count}" for field, count in field_match_count.items()]
    )
    tc.set_returns(
        tool_return=(
            f"Features extracted from free text."
            f"\n\n{match_count_string}\n\n"
            f"The new dataset with extracted features has been saved to {output_path}"
        ),
        files_in=[os.path.basename(input_path)],
        files_out=[os.path.basename(output_path)],
    )
class FeatureExtractionFromText(ToolBase):
    """Tool wrapper that runs `feature_extraction_from_text` through `execute_tool`."""

    def _execute(self, **kwargs: Any) -> ToolReturnIter:
        """Start the extraction via `execute_tool` and return the output stream it provides."""
        # Both file arguments are joined onto the tool's working directory.
        input_path = os.path.join(self.working_directory, kwargs["data_file_path"])
        output_path = os.path.join(self.working_directory, kwargs["extracted_data_file_path"])

        worker, stream = execute_tool(
            feature_extraction_from_text,
            wd=self.working_directory,
            data_file_path=input_path,
            extracted_data_file_path=output_path,
            topics_dict=kwargs["topics_dict"],
            workspace=self.working_directory,
        )
        # Keep the first element returned by execute_tool on the instance.
        self.tool_thread = worker
        return stream

    @property
    def name(self) -> str:
        """Identifier under which the tool is registered."""
        return "feature_extraction_from_text"

    @property
    def description(self) -> str:
        """Description text used in the function specification."""
        return """
        Uses the `feature_extraction_from_text` tool to extract the features from free text fields.
        """

    @property
    def specification(self) -> Dict[str, Any]:
        """Function-calling schema: three required string parameters."""
        topics_dict_description = """
                                A json formatted string structured as a nested dictionary where keys are free-text column names, and
                                values are dictionaries mapping topics to their synonyms. The \
                                synonyms should be a list of the top ten words associated with the topic.
                                The dictionary should be structured as follows,
                                ```json
                                {
                                    "column1": {
                                        "topic1": ["synonym1", "synonym2", ...],
                                        "topic2": ["synonym3", "synonym4", ...]
                                    },
                                    "column2": {
                                        "topic1": ["synonym1", "synonym2", ...],
                                        "topic3": ["synonym5", "synonym6", ...]
                                    },
                                }
                                ```
                                """
        properties = {
            "data_file_path": {"type": "string", "description": "Path to the data file."},
            "extracted_data_file_path": {
                "type": "string",
                "description": "Path to the data file with extracted features, which this function creates.",
            },
            "topics_dict": {
                "type": "string",
                "description": topics_dict_description,
            },
        }
        parameters = {
            "type": "object",
            "properties": properties,
            "required": ["data_file_path", "extracted_data_file_path", "topics_dict"],
        }
        function_spec = {
            "name": self.name,
            "description": self.description,
            "parameters": parameters,
        }
        return {"type": "function", "function": function_spec}

    @property
    def description_for_user(self) -> str:
        """Short explanation of the tool shown to the end user."""
        return "Uses an LLM to extract the features from free text fields."
def clean_dataframe(df, unique_threshold=15):
    """
    Coerce every column of ``df`` to a numeric representation, in place.

    Each column is classified once, in this order of precedence:

    - boolean: dtype is ``bool``; converted to integers.
    - categorical: fewer than ``unique_threshold`` distinct non-missing values, or
      dtype ``object``; replaced by its ``pd.Categorical`` integer codes.
    - numerical: any other numeric dtype; passed through ``pd.to_numeric`` with
      ``errors="coerce"``.
    - anything else: numeric conversion is attempted right away; if it raises
      ``ValueError`` the column is treated as categorical instead.

    Parameters:
    - df (pd.DataFrame): The frame to clean. It is modified in place.
    - unique_threshold (int): Columns with fewer distinct non-missing values than this
      are treated as categorical regardless of dtype.

    Returns:
    - pd.DataFrame: The same ``df`` object that was passed in.
    """
    as_categorical = []
    as_numerical = []
    as_boolean = []

    # Phase 1: decide how each column will be treated
    for name in df.columns:
        series = df[name]
        distinct_count = len(series.dropna().unique())  # missing values are not counted

        if series.dtype == "bool":
            as_boolean.append(name)
            continue
        if distinct_count < unique_threshold or series.dtype == "object":
            as_categorical.append(name)
            continue
        if pd.api.types.is_numeric_dtype(series):
            as_numerical.append(name)
            continue

        # Mixed or unexpected dtype: try a numeric conversion immediately
        try:
            df[name] = pd.to_numeric(series, errors="coerce")
        except ValueError:
            as_categorical.append(name)
        else:
            as_numerical.append(name)

    # A column claimed as categorical or boolean is never cleaned as numerical
    as_numerical = [
        name for name in as_numerical if not (name in as_categorical or name in as_boolean)
    ]

    # Phase 2: apply the conversions
    for name in as_categorical:
        df[name] = pd.Categorical(df[name]).codes

    for name in as_numerical:
        df[name] = pd.to_numeric(df[name], errors="coerce")

    for name in as_boolean:
        df[name] = df[name].astype(int)

    return df