medi-llm / src /data_preprocessing.py
Preetham22's picture
Auto-format code with Black
562137e
import os
import pandas as pd
from PIL import Image
from transformers import AutoTokenizer
# Turn off parallelism in the tokenizers library to avoid its warnings.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# This script preprocesses EMR data and images for a clinical NLP task.
# It loads a CSV file containing EMR records, tokenizes the text using a
# clinical BERT tokenizer, and preprocesses images for further analysis.
#
# Shared tokenizer used by preprocess_text(). It is created once, when this
# module is imported, from the Bio_ClinicalBERT checkpoint (a basic BERT
# checkpoint could be substituted here instead).
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
def load_data(csv_path):
    """
    Load a CSV file into a pandas DataFrame.

    Args:
        csv_path: Path to the CSV file (string or path-like object).

    Returns:
        pandas.DataFrame: The parsed contents of the CSV file.

    Raises:
        FileNotFoundError: If ``csv_path`` does not point to an existing
            file (the path is missing, or it is a directory).
    """
    # isfile() rather than exists(): a directory also "exists" but cannot be
    # parsed as CSV, so it is rejected here with the same clear error instead
    # of surfacing as an unrelated OS error from inside pandas.
    if not os.path.isfile(csv_path):
        raise FileNotFoundError(f"CSV file not found at {csv_path}")
    return pd.read_csv(csv_path)
def preprocess_text(text, max_length=128):
    """
    Tokenize a single text string with the module-level ``tokenizer``.

    Args:
        text: The text to tokenize. Must be a ``str``; anything else (for
            example ``None`` or a float) is rejected.
        max_length: Number of tokens the sequence is truncated or padded to.
            Defaults to 128.

    Returns:
        The tokenizer's output for ``text``, requested as PyTorch tensors
        (``return_tensors="pt"``), truncated and padded to ``max_length``.

    Raises:
        ValueError: If ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise ValueError("Input text must be a string.")
    return tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
def preprocess_image(image_path, image_size=(224, 224)):
    """
    Load an image file, convert it to RGB and resize it.

    Args:
        image_path: Path to the image file.
        image_size: Target size passed to ``Image.resize``, as a
            ``(width, height)`` tuple in pixels. Defaults to ``(224, 224)``.

    Returns:
        PIL.Image.Image: The RGB image resized to ``image_size``.

    Raises:
        FileNotFoundError: If nothing exists at ``image_path``.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found at {image_path}")
    # Image.open() is lazy and holds the file open. The context manager
    # releases the handle as soon as convert() has produced an independent
    # in-memory image, so nothing is leaked when many images are processed.
    with Image.open(image_path) as source:
        rgb_image = source.convert("RGB")
    return rgb_image.resize(image_size)
if __name__ == "__main__":
    # Demo run: load the EMR CSV, then preprocess the text and the image of
    # its first record.
    #
    # abspath() keeps the project root correct even when __file__ is a
    # relative path; without it the two dirname() calls can collapse to "".
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    data_path = os.path.join(base_dir, "data", "emr_records.csv")

    df = load_data(data_path)
    print("Data loaded successfully.")
    if df.empty:
        # Fail with a readable message instead of an IndexError from iloc[0].
        raise SystemExit(f"No records found in {data_path}")

    # Prefix every entry of the 'image_path' column with the project root
    # (os.path.join leaves an already-absolute path unchanged).
    df["image_path"] = df["image_path"].apply(lambda p: os.path.join(base_dir, p))

    # Look the first row up once, after the image paths have been rewritten.
    first_record = df.iloc[0]
    print("Sample record:")
    print(first_record)

    text_encoding = preprocess_text(first_record["emr_text"])
    print("Tokenized EMR:")
    print(text_encoding.input_ids.shape)

    img = preprocess_image(first_record["image_path"])
    img.show()  # Display the image