File size: 2,067 Bytes
37e344c
af86b36
37e344c
 
 
af86b36
 
37e344c
 
af86b36
 
 
37e344c
 
 
 
af86b36
37e344c
 
 
 
 
 
af86b36
37e344c
 
 
af86b36
37e344c
 
 
 
 
 
af86b36
562137e
af86b36
 
37e344c
 
 
 
 
 
 
 
af86b36
37e344c
 
 
af86b36
37e344c
 
af86b36
37e344c
 
af86b36
 
562137e
af86b36
37e344c
 
af86b36
 
37e344c
 
af86b36
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import pandas as pd
from PIL import Image
from transformers import AutoTokenizer

# Disable parallelism to avoid warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# This script preprocesses EMR data and images for a clinical NLP task.
# It loads a CSV file containing EMR records, tokenizes the text using a
# clinical BERT tokenizer, and preprocesses images for further analysis.

# Clinical-domain tokenizer (Bio_ClinicalBERT). NOTE: loading at module
# level triggers a download of the tokenizer files on first run, so
# importing this module requires network access (or a warm HF cache).
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


def load_data(csv_path):
    """Read EMR records from a CSV file into a DataFrame.

    Args:
        csv_path: Path to the CSV file of EMR records.

    Returns:
        A pandas DataFrame with the file's contents.

    Raises:
        FileNotFoundError: If no file exists at ``csv_path``.
    """
    # Guard clause: fail early with a clear message rather than letting
    # pandas surface its own error.
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found at {csv_path}")
    return pd.read_csv(csv_path)


def preprocess_text(text):
    """Tokenize one EMR note with the module-level clinical tokenizer.

    Args:
        text: Raw EMR note text.

    Returns:
        A BatchEncoding of PyTorch tensors, truncated/padded to 128 tokens.

    Raises:
        ValueError: If ``text`` is not a string.
    """
    if isinstance(text, str):
        # Fixed-length encoding so downstream batches have a uniform shape.
        return tokenizer(
            text, truncation=True, padding="max_length", max_length=128, return_tensors="pt"
        )
    raise ValueError("Input text must be a string.")


def preprocess_image(image_path, image_size=(224, 224)):
    """Load an image, convert it to RGB, and resize it.

    Args:
        image_path: Path to the image file.
        image_size: Target (width, height) in pixels; defaults to 224x224.

    Returns:
        A new PIL Image in RGB mode at ``image_size``.

    Raises:
        FileNotFoundError: If no file exists at ``image_path``.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found at {image_path}")

    # Use a context manager so the underlying file handle is closed
    # deterministically (Image.open is lazy and otherwise holds the file
    # open until GC). convert/resize both return new images, so the
    # returned image is safe to use after the file is closed.
    with Image.open(image_path) as img:
        return img.convert("RGB").resize(image_size)


if __name__ == "__main__":
    # Project root is the parent of this script's directory; the data
    # folder is expected to live directly under it.
    base_dir = os.path.dirname(os.path.dirname(__file__))
    data_path = os.path.join(base_dir, "data", "emr_records.csv")

    df = load_data(data_path)
    print("Data loaded successfully.")

    # The CSV stores image paths relative to the project root; rewrite
    # each one as an absolute path.
    df["image_path"] = df["image_path"].apply(lambda rel: os.path.join(base_dir, rel))
    print("Sample record:")
    print(df.iloc[0])

    # Smoke-test the text and image pipelines on the first record.
    first_record = df.iloc[0]
    text_encoding = preprocess_text(first_record["emr_text"])
    print("Tokenized EMR:")
    print(text_encoding.input_ids.shape)

    img = preprocess_image(first_record["image_path"])
    img.show()  # Display the image