Spaces:
Sleeping
Sleeping
| import os | |
| import pandas as pd | |
| from PIL import Image | |
| from transformers import AutoTokenizer | |
# This script preprocesses EMR data and images for a clinical NLP task.
# It loads a CSV file containing EMR records, tokenizes the text using a
# clinical BERT tokenizer, and preprocesses images for further analysis.

# Disable tokenizer parallelism to avoid warnings.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Module-level tokenizer shared by preprocess_text(); loaded once, when this
# module is executed, from the Bio_ClinicalBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
def load_data(csv_path):
    """Load EMR records from a CSV file into a DataFrame.

    Args:
        csv_path: Path to the CSV file to read.

    Returns:
        A pandas DataFrame with the parsed contents of the file.

    Raises:
        FileNotFoundError: If ``csv_path`` does not point to an existing
            regular file (this includes paths that are directories).
    """
    # isfile() rather than exists(): a directory would pass an exists() check
    # and then fail inside pandas with a less helpful error.
    if not os.path.isfile(csv_path):
        raise FileNotFoundError(f"CSV file not found at {csv_path}")
    return pd.read_csv(csv_path)
def preprocess_text(text, max_length=128):
    """Tokenize a single EMR text string with the module-level tokenizer.

    Args:
        text: The text to tokenize. Must be a ``str``.
        max_length: Length the sequence is padded or truncated to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        The tokenizer's output for ``text``, requested as PyTorch tensors
        (``return_tensors="pt"``).

    Raises:
        ValueError: If ``text`` is not a string (for example a missing
            value read from a DataFrame).
    """
    if not isinstance(text, str):
        raise ValueError("Input text must be a string.")
    return tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
def preprocess_image(image_path, image_size=(224, 224)):
    """Load an image, convert it to RGB and resize it.

    Args:
        image_path: Path to the image file.
        image_size: Target ``(width, height)`` passed to ``Image.resize``.
            Defaults to ``(224, 224)``.

    Returns:
        A new PIL image in RGB mode, resized to ``image_size``.

    Raises:
        FileNotFoundError: If ``image_path`` does not point to an existing
            regular file (this includes paths that are directories).
    """
    # isfile() rather than exists(): a directory would pass an exists() check
    # and then fail inside PIL with a less helpful error.
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image file not found at {image_path}")
    # Open via a context manager so the underlying file handle is always
    # released; convert() and resize() return new images that stay valid
    # after the source image is closed.
    with Image.open(image_path) as source:
        return source.convert("RGB").resize(image_size)
if __name__ == "__main__":
    # Demo run: load the EMR CSV, then tokenize the text and preprocess the
    # image of the first record.

    # Resolve the base directory (the parent of this script's directory) from
    # an absolute path so it does not depend on how the script was invoked or
    # on the current working directory.
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    data_path = os.path.join(base_dir, "data", "emr_records.csv")

    df = load_data(data_path)
    print("Data loaded successfully.")

    # Everything below inspects the first record; stop with a clear message
    # instead of an IndexError when the CSV has no rows.
    if df.empty:
        raise SystemExit(f"No records found in {data_path}")

    # Join the base directory with each relative path in the 'image_path'
    # column.
    df["image_path"] = df["image_path"].apply(lambda p: os.path.join(base_dir, p))

    first_record = df.iloc[0]
    print("Sample record:")
    print(first_record)

    text_encoding = preprocess_text(first_record["emr_text"])
    print("Tokenized EMR:")
    print(text_encoding.input_ids.shape)

    img = preprocess_image(first_record["image_path"])
    img.show()  # Display the image