"""Preprocess EMR text and images for a clinical NLP task.

Loads a CSV file containing EMR records, tokenizes the text using a
clinical BERT tokenizer, and preprocesses images for further analysis.
"""

import os

import pandas as pd
from PIL import Image
from transformers import AutoTokenizer

# Disable tokenizer parallelism to avoid warnings.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use a clinical tokenizer (or basic BERT).
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


def load_data(csv_path: str) -> pd.DataFrame:
    """Load the EMR records from a CSV file.

    Args:
        csv_path: Path to the CSV file.

    Returns:
        The parsed contents of the file as a DataFrame.

    Raises:
        FileNotFoundError: If nothing exists at ``csv_path``.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found at {csv_path}")
    return pd.read_csv(csv_path)


def preprocess_text(text: str):
    """Tokenize a single text with the module-level ``tokenizer``.

    The text is truncated and padded to a fixed length of 128 tokens and the
    encoding is requested as PyTorch tensors (``return_tensors="pt"``).

    Args:
        text: The raw text to tokenize.

    Returns:
        The encoding object produced by the tokenizer.

    Raises:
        ValueError: If ``text`` is not a ``str`` (e.g. a NaN cell from pandas).
    """
    if not isinstance(text, str):
        raise ValueError("Input text must be a string.")
    return tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )


def preprocess_image(image_path: str, image_size=(224, 224)) -> Image.Image:
    """Load an image, convert it to RGB and resize it.

    Args:
        image_path: Path to the image file.
        image_size: Target ``(width, height)`` passed to ``Image.resize``.

    Returns:
        The converted and resized PIL image.

    Raises:
        FileNotFoundError: If nothing exists at ``image_path``.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found at {image_path}")
    # Use a context manager so the underlying file handle is released
    # deterministically; convert() returns a new in-memory image, so the
    # result stays valid after the file is closed.
    with Image.open(image_path) as source:
        rgb = source.convert("RGB")
    return rgb.resize(image_size)


def main() -> None:
    """Run a smoke test on the first record of ``data/emr_records.csv``."""
    # abspath() guards against a relative __file__, for which the double
    # dirname() would otherwise collapse to "" instead of the parent directory.
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    data_path = os.path.join(base_dir, "data", "emr_records.csv")

    df = load_data(data_path)
    print("Data loaded successfully.")

    # Fail with a clear message instead of an opaque KeyError/IndexError.
    missing = [col for col in ("emr_text", "image_path") if col not in df.columns]
    if missing:
        raise SystemExit(
            f"{data_path} is missing required column(s): {', '.join(missing)}"
        )
    if df.empty:
        raise SystemExit(f"{data_path} contains no records.")

    # Image paths in the CSV are relative to the base directory; join them
    # with the base directory for every row.
    df["image_path"] = df["image_path"].apply(lambda p: os.path.join(base_dir, p))

    first_record = df.iloc[0]
    print("Sample record:")
    print(first_record)

    text_encoding = preprocess_text(first_record["emr_text"])
    print("Tokenized EMR:")
    print(text_encoding.input_ids.shape)

    img = preprocess_image(first_record["image_path"])
    img.show()  # Display the image


if __name__ == "__main__":
    main()