Spaces:
Sleeping
Sleeping
File size: 2,067 Bytes
37e344c af86b36 37e344c af86b36 37e344c af86b36 37e344c af86b36 37e344c af86b36 37e344c af86b36 37e344c af86b36 562137e af86b36 37e344c af86b36 37e344c af86b36 37e344c af86b36 37e344c af86b36 562137e af86b36 37e344c af86b36 37e344c af86b36 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
import os
import pandas as pd
from PIL import Image
from transformers import AutoTokenizer
# Disable parallelism to avoid warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# This script preprocesses EMR data and images for a clinical NLP task.
# It loads a CSV file of EMR records, tokenizes the text with a clinical
# BERT tokenizer, and preprocesses the paired images for further analysis.
# Load a clinical-domain tokenizer (Bio_ClinicalBERT); a plain BERT
# tokenizer would also work. NOTE: from_pretrained fetches the files
# from the Hugging Face Hub on first run — requires network access.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
def load_data(csv_path):
    """Read EMR records from *csv_path* into a DataFrame.

    Raises:
        FileNotFoundError: if no file exists at *csv_path*.
    """
    # Guard clause: fail fast with a clear message before pandas touches it.
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)
    raise FileNotFoundError(f"CSV file not found at {csv_path}")
def preprocess_text(text):
    """Tokenize an EMR note for model input.

    Returns a BatchEncoding of PyTorch tensors, padded/truncated to a
    fixed length of 128 tokens.

    Raises:
        ValueError: if *text* is not a string.
    """
    if not isinstance(text, str):
        raise ValueError("Input text must be a string.")
    encoding = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )
    return encoding
def preprocess_image(image_path, image_size=(224, 224)):
    """Load an image from disk, convert it to RGB, and resize it.

    Parameters
    ----------
    image_path : str
        Path to the image file on disk.
    image_size : tuple[int, int], optional
        Target (width, height) in pixels; defaults to (224, 224).

    Returns
    -------
    PIL.Image.Image
        The resized RGB image (detached from the source file).

    Raises
    ------
    FileNotFoundError
        If *image_path* does not exist.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found at {image_path}")
    # Fix: the original leaked the underlying file handle — Image.open is
    # lazy and keeps the file open until GC. The context manager closes it
    # deterministically; convert() produces a new in-memory image, so the
    # returned object does not depend on the (now closed) file.
    with Image.open(image_path) as img:
        return img.convert("RGB").resize(image_size)
if __name__ == "__main__":
    # Project root is one level above this script's directory.
    base_dir = os.path.dirname(os.path.dirname(__file__))
    csv_file = os.path.join(base_dir, "data", "emr_records.csv")

    records = load_data(csv_file)
    print("Data loaded successfully.")

    # Image paths in the CSV are relative to the project root; rewrite
    # them as absolute paths so the files can be opened from anywhere.
    records["image_path"] = records["image_path"].apply(
        lambda rel: os.path.join(base_dir, rel)
    )

    first = records.iloc[0]
    print("Sample record:")
    print(first)

    text_encoding = preprocess_text(first["emr_text"])
    print("Tokenized EMR:")
    print(text_encoding.input_ids.shape)

    preprocess_image(first["image_path"]).show()  # Display the image
|