Spaces:
Sleeping
Sleeping
| import os | |
| import random | |
| import csv | |
| import string | |
| from pathlib import Path | |
| # Detect CI environment | |
| IS_CI = os.getenv("CI", "false").lower() == "true" | |
| # Set number of samples accordingly | |
| SAMPLES_PER_CLASS = 3 if IS_CI else 300 # Reduced for CI to speed up tests | |
| # Paths | |
| CURRENT_DIR = Path(__file__).resolve().parent | |
| IMAGES_DIR = CURRENT_DIR.parent / "data" / "images" # Absolute path of images folder | |
| OUTPUT_FILE = CURRENT_DIR.parent / "data" / "emr_records.csv" | |
| # Label to triage | |
| triage_map = {"COVID": "high", "NORMAL": "low", "VIRAL PNEUMONIA": "medium"} | |
| # Shared ambiguous templates | |
| shared_symptoms = [ | |
| "Mild cough and slight fever reported.", | |
| "General fatigue and throat irritation present.", | |
| "Breathing mildly labored during physical exertion.", | |
| "No major respiratory distress; mild wheezing noted.", | |
| "Occasional chest tightness reported.", | |
| "Vital signs mostly stable; slight variation in temperature.", | |
| ] | |
| # Overlapping diagnosis clues to add ambiguity | |
| shared_diagnosis = [ | |
| "Symptoms could relate to a range of viral infections.", | |
| "Presentation not distinctly matching any single infection.", | |
| "Further tests required to confirm diagnosis.", | |
| "Findings are borderline; clinical judgment advised.", | |
| "Observation warranted due to overlapping signs.", | |
| "Initial assessment inconclusive.", | |
| ] | |
| # Noise sentences | |
| neutral_noise = [ | |
| "Patient is cooperative and alert.", | |
| "Dietary habits unremarkable.", | |
| "Hydration status normal.", | |
| "Follow-up advised if symptoms persist.", | |
| "No notable family medical history.", | |
| "No medications currently administered.", | |
| ] | |
| def random_token(): | |
| prefix = "ID" | |
| letters = "".join(random.choices(string.ascii_uppercase, k=2)) | |
| digits = "".join(random.choices(string.digits, k=2)) | |
| return f"{prefix}-{letters}{digits}" | |
| def get_oxygen(label): | |
| # Soft blur across classes | |
| if label == "NORMAL": | |
| return random.randint(94, 100) | |
| elif label == "VIRAL PNEUMONIA": | |
| return random.randint(90, 96) | |
| else: | |
| return random.randint(87, 94) | |
| def get_temp(label): | |
| if label == "NORMAL": | |
| return round(random.uniform(97.5, 99.0), 1) | |
| else: | |
| return round(random.uniform(98.8, 102.5), 1) | |
| def get_age(): | |
| return random.randint(18, 85) | |
| def get_days(): | |
| return random.randint(1, 10) | |
| def build_emr(label, i): | |
| pid = random_token() | |
| age = f"{get_age()}-year-old" | |
| days = get_days() | |
| temp = get_temp(label) | |
| oxygen = get_oxygen(label) | |
| intro = f"Patient {pid}, a {age}, reports symptoms for {days} days." | |
| vitals = f"Temperature recorded at {temp}°F and SPO2 at {oxygen}%." | |
| # Shared symptoms + blurred logic | |
| body = [ | |
| intro, | |
| random.choice(shared_symptoms), | |
| vitals, | |
| random.choice(shared_diagnosis), | |
| ] | |
| # Optionally inject a mild class-specific clue (with low probability) | |
| if random.random() < 0.3: | |
| if label == "COVID": | |
| body.append("Patient reports recent loss of taste.") | |
| elif label == "VIRAL PNEUMONIA": | |
| body.append("Chest X-ray shows scattered infiltrates.") | |
| elif label == "NORMAL": | |
| body.append("No active complaints at this time.") | |
| # Inject 1–2 noise sentences | |
| if random.random() < 0.8: | |
| body.insert(random.randint(1, len(body)), random.choice(neutral_noise)) | |
| if random.random() < 0.5: | |
| body.insert(random.randint(1, len(body)), random.choice(neutral_noise)) | |
| random.shuffle(body[1:]) # Keep intro in position 0 | |
| return " ".join(body) | |
| # Generate records | |
| def generate_dataset(image_dir_override=None, output_path_override=None): | |
| root_image_dir = image_dir_override or IMAGES_DIR | |
| output_file = output_path_override or OUTPUT_FILE | |
| # Folders | |
| categories = { | |
| "COVID": root_image_dir / "COVID", # Absolute path of Image labels | |
| "NORMAL": root_image_dir / "NORMAL", | |
| "VIRAL PNEUMONIA": root_image_dir / "VIRAL PNEUMONIA", | |
| } | |
| records = [] | |
| for label, img_dir in categories.items(): | |
| image_files = sorted( | |
| [ | |
| f | |
| for f in img_dir.glob("*") | |
| if f.suffix.lower() in [".png", ".jpg", ".jpeg"] | |
| ] | |
| ) | |
| if not image_files: | |
| raise FileNotFoundError( | |
| f"No images found in {img_dir}. Folder contents: {list(img_dir.iterdir())}") | |
| for i in range(SAMPLES_PER_CLASS): | |
| image_path = str( | |
| random.choice(image_files).relative_to(CURRENT_DIR.parent) # path of image respective to the project root | |
| ) | |
| text = build_emr(label, i) | |
| triage = triage_map[label] | |
| records.append([f"{label}-{i + 1}", image_path, text, triage]) | |
| # Shuffle + write | |
| random.shuffle(records) | |
| with open(output_file, "w", newline="") as f: | |
| writer = csv.writer(f) | |
| writer.writerow(["patient_id", "image_path", "emr_text", "triage_level"]) | |
| writer.writerows(records) | |
| print(f"✅ EMR dataset generated at {output_file}") | |
| if __name__ == "__main__": | |
| generate_dataset(image_dir_override=IMAGES_DIR, output_path_override=OUTPUT_FILE) | |