Spaces:

wi-lab
/

dataset-distancing-lab

Sleeping

App Files Files Community

wi-lab committed on Oct 17, 2025

Commit

f0949fa

1 Parent(s): b6e8a86

Create app.py

Browse files

Files changed (1) hide show

app.py +339 -0

app.py ADDED Viewed

	@@ -0,0 +1,339 @@

+import os
+import io
+import json
+from typing import List, Dict, Tuple, Optional
+import numpy as np
+import torch
+import gradio as gr
+from huggingface_hub import snapshot_download, HfFolder
+# =========================
+# Helpers: logging + tokens
+# =========================
# Hugging Face repo id used when the "Model repo (HF)" field is left blank.
DEFAULT_MODEL_REPO = "wi-lab/lwm-v1.1"
# Local directory used when the "Local model dir" field is left blank.
DEFAULT_MODEL_DIR = "./LWM-v1.1"
def ensure_hf_token():
    """Save the ``HF_TOKEN`` environment variable through ``HfFolder`` when set.

    Returns:
        The value of ``HF_TOKEN``, or ``None`` when the variable is absent.
        An empty value is returned as-is and nothing is saved.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        # Nothing to persist (unset or empty).
        return token
    HfFolder.save_token(token)
    return token
def log_md(msg: str) -> str:
    """Return ``msg`` rendered as the string passed to the status outputs."""
    return "{}".format(msg)
+# =========================
+# Dataset loading utilities
+# =========================
def _load_pt_bytes(b: bytes) -> Dict[str, Optional[torch.Tensor]]:
    """Deserialize a ``torch.save`` payload into a channels/labels mapping.

    Args:
        b: Raw bytes of a ``.pt`` / ``.pth`` file.

    Returns:
        ``{"channels": Tensor, "labels": Tensor or None}``. A bare tensor is
        treated as the channels; a dict must contain ``"channels"`` and may
        contain ``"labels"``.

    Raises:
        ValueError: If the payload is neither a tensor nor a dict with a
            ``"channels"`` entry.
        Exception: Whatever ``torch.load`` raises for payloads that are
            corrupt or contain objects not permitted by ``weights_only``.
    """
    # SECURITY: uploads are untrusted. weights_only=True restricts unpickling
    # to tensors and plain containers instead of running arbitrary pickled code.
    obj = torch.load(io.BytesIO(b), map_location="cpu", weights_only=True)

    if torch.is_tensor(obj):
        return {"channels": obj, "labels": None}

    if isinstance(obj, dict) and "channels" in obj:
        labels = obj.get("labels")
        return {
            # as_tensor is a no-op for tensors and converts plain lists/numbers,
            # so downstream code can rely on tensor attributes such as ``.ndim``.
            "channels": torch.as_tensor(obj["channels"]),
            "labels": None if labels is None else torch.as_tensor(labels),
        }

    raise ValueError("PT file must contain a dict with 'channels' (and optional 'labels'), or a tensor.")
def _load_npy_bytes(b: bytes) -> Dict[str, Optional[torch.Tensor]]:
    """Load a ``.npy`` payload holding one array and wrap it as channels.

    Args:
        b: Raw bytes of a ``.npy`` file.

    Returns:
        ``{"channels": Tensor, "labels": None}``.

    Raises:
        ValueError: If the payload is not a single ndarray, or if it contains
            pickled objects (rejected by ``allow_pickle=False``).
        TypeError: If the array dtype cannot be converted by
            ``torch.from_numpy``.
    """
    # SECURITY: uploads are untrusted. Pickled object arrays are refused; they
    # could not be converted by torch.from_numpy in any case.
    arr = np.load(io.BytesIO(b), allow_pickle=False)
    if isinstance(arr, np.ndarray):
        return {"channels": torch.from_numpy(arr), "labels": None}
    # np.load returns an archive object for zip payloads.
    raise ValueError("NPY must contain a single ndarray (channels). For dict-like, use NPZ.")
def _load_npz_bytes(b: bytes) -> Dict[str, Optional[torch.Tensor]]:
    """Load an ``.npz`` archive into a channels/labels mapping.

    The archive is expected to hold ``"channels"`` and optionally ``"labels"``.
    When ``"channels"`` is missing, the first stored array is used as channels
    and no labels are returned.

    Args:
        b: Raw bytes of an ``.npz`` file.

    Returns:
        ``{"channels": Tensor, "labels": Tensor or None}``.

    Raises:
        ValueError: If the archive is empty, the payload is a bare array
            rather than an archive, or it contains pickled objects.
    """
    # SECURITY: uploads are untrusted, so pickled members are refused.
    loaded = np.load(io.BytesIO(b), allow_pickle=False)
    if isinstance(loaded, np.ndarray):
        raise ValueError("Expected an NPZ archive but found a single array; upload it as .npy instead.")

    # The context manager closes the underlying zip handle once the arrays
    # have been read into memory.
    with loaded as npz:
        names = list(npz.files)
        if not names:
            raise ValueError("Empty NPZ.")
        if "channels" in names:
            channels = npz["channels"]
            labels = npz["labels"] if "labels" in names else None
        else:
            # Fallback: take the first array in the file.
            channels = npz[names[0]]
            labels = None

    return {
        "channels": torch.from_numpy(channels),
        "labels": None if labels is None else torch.from_numpy(labels),
    }
def _read_upload(item: object) -> Tuple[str, bytes]:
    """Return ``(name, raw_bytes)`` for one upload given as bytes, path or file object."""
    if isinstance(item, (bytes, bytearray, memoryview)):
        # Raw payload: no filename is available.
        return "", bytes(item)
    if isinstance(item, (str, os.PathLike)):
        path = os.fspath(item)
        with open(path, "rb") as fh:
            return str(path), fh.read()
    name = str(getattr(item, "name", "") or "")
    reader = getattr(item, "read", None)
    if callable(reader):
        return name, reader()
    if name:
        with open(name, "rb") as fh:
            return name, fh.read()
    raise ValueError(f"Unsupported upload object: {type(item).__name__}")


def _detect_format(name: str, data: bytes) -> str:
    """Classify an upload as ``"pt"``, ``"npy"`` or ``"npz"`` by extension, else by magic bytes."""
    lowered = name.lower()
    if lowered.endswith((".pt", ".pth")):
        return "pt"
    if lowered.endswith(".npy"):
        return "npy"
    if lowered.endswith(".npz"):
        return "npz"
    if name:
        raise ValueError(f"Unsupported file type: {name}")

    # No filename (raw bytes): sniff the container format instead.
    if data[:6] == b"\x93NUMPY":
        return "npy"
    if data[:4] == b"PK\x03\x04":
        import zipfile

        with zipfile.ZipFile(io.BytesIO(data)) as archive:
            members = archive.namelist()
        # An NPZ archive stores one ``<key>.npy`` member per array; any other
        # zip layout is treated as a torch.save archive.
        if members and all(member.endswith(".npy") for member in members):
            return "npz"
        return "pt"
    if data[:1] == b"\x80":
        # Pickle protocol marker, as written by the legacy torch.save format.
        return "pt"
    raise ValueError("Unsupported file type: expected .pt/.pth/.npy/.npz content")


def parse_uploaded_datasets(files: Optional[List[object]]) -> Dict[int, Dict[str, Optional[torch.Tensor]]]:
    """
    Accepts multiple uploads. Each becomes one dataset.

    Each item may be raw ``bytes`` (what a ``gr.File(type="binary")`` input
    delivers), a filesystem path, or a file-like object exposing ``name`` and
    ``read()``. The format is taken from the filename extension when a name is
    available, otherwise from the payload's magic bytes.

    Supported:
      - .pt / .pth (torch.save)
      - .npy (single array)
      - .npz (expects 'channels' and optional 'labels', else uses first array)

    Args:
        files: Uploaded items, a single item, or ``None``.

    Returns:
        ``{0: {'channels': Tensor[N, ...], 'labels': Optional[Tensor]}, 1: {...}, ...}``
        keyed by upload position. Channels with fewer than two dimensions are
        reshaped to a single row ``[1, D]``.

    Raises:
        ValueError: If any upload cannot be read, recognised or decoded; the
            original exception is chained as ``__cause__``.
    """
    if files is None:
        items = []
    elif isinstance(files, (list, tuple)):
        items = list(files)
    else:
        # A single upload passed without a surrounding list.
        items = [files]

    datasets: Dict[int, Dict[str, Optional[torch.Tensor]]] = {}
    for pos, item in enumerate(items):
        label = f"upload #{pos}"
        try:
            name, data = _read_upload(item)
            label = name or label
            kind = _detect_format(name, data)
            if kind == "pt":
                ds = _load_pt_bytes(data)
            elif kind == "npy":
                ds = _load_npy_bytes(data)
            else:
                ds = _load_npz_bytes(data)
            # Shape channels as [N, ...]: scalars and vectors become one row.
            ch = ds["channels"]
            if ch.ndim < 2:
                ch = ch.reshape(1, -1)
            ds["channels"] = ch
        except Exception as e:
            raise ValueError(f"Failed to load '{label}': {e}") from e
        datasets[pos] = ds
    return datasets
+# =========================
+# Distance backends (stubs)
+# =========================
def _to_feature_matrix(chs: torch.Tensor) -> torch.Tensor:
    """
    Collapse each sample to one row and return a float32 matrix.

    Inputs with three or more dimensions are reshaped to ``[N, D]``; 2-D
    inputs are used unchanged; anything else is viewed as ``[N, -1]``.
    Complex inputs are expanded to ``[real | imag]`` along the feature axis.
    """
    n_dims = chs.ndim
    if n_dims == 2:
        flat = chs  # already [N, D]
    elif n_dims >= 3:
        flat = chs.reshape(chs.shape[0], -1)  # [N, ...] -> [N, D]
    else:
        flat = chs.view(chs.shape[0], -1)

    if flat.is_complex():
        flat = torch.cat((flat.real, flat.imag), dim=1)
    return flat.to(torch.float32)
def _pad_to_same_dim(mats: List[torch.Tensor]) -> List[torch.Tensor]:
    """Zero-pad 2-D matrices on the right so they share the same column count.

    Args:
        mats: Matrices shaped ``[N_i, D_i]``.

    Returns:
        A new list in the same order. Matrices already at the maximum width
        are returned unchanged; narrower ones get trailing zero columns.
        An empty input yields an empty list.
    """
    if not mats:
        return []
    widest = max(m.shape[1] for m in mats)
    padded = []
    for m in mats:
        missing = widest - m.shape[1]
        if missing > 0:
            # new_zeros inherits dtype and device from ``m``, so the
            # concatenation also works for tensors that are not on the CPU.
            m = torch.cat([m, m.new_zeros((m.shape[0], missing))], dim=1)
        padded.append(m)
    return padded
def compute_distance_matrix_raw(
    datasets: Dict[int, Dict[str, torch.Tensor]],
    n_per_dataset: int,
    distance_mode: str,
    sw_num_projections: int,
    label_aware: bool,
    label_weighting: str,
    label_max_per_class: int
) -> torch.Tensor:
    """
    Minimal RAW baseline: pairwise distance between dataset centroids.

    Each dataset is optionally subsampled, flattened to ``[N, D]``, zero-padded
    to a common feature width and reduced to its mean vector.

    Args:
        datasets: Mapping ``index -> {"channels": Tensor[N, ...], ...}``;
            datasets are processed in sorted key order.
        n_per_dataset: Maximum number of randomly chosen samples per dataset.
            Zero, ``None`` or a negative value means "use every sample".
        distance_mode: ``"cosine_similarity"`` returns ``1 - cosine`` between
            centroids. Any other value, including ``"sliced_wasserstein"``
            (not implemented here), falls back to Euclidean centroid distance.
        sw_num_projections: Unused by this baseline.
        label_aware: Unused by this baseline.
        label_weighting: Unused by this baseline.
        label_max_per_class: Unused by this baseline.

    Returns:
        A square tensor with one row and column per dataset.

    Raises:
        ValueError: If no datasets are given or a dataset has no samples.
    """
    if not datasets:
        raise ValueError("No datasets provided.")

    # Non-positive limits mean "no subsampling"; a negative slice bound would
    # otherwise silently drop samples from the end of the permutation.
    limit = int(n_per_dataset) if n_per_dataset else 0

    mats = []
    for key in sorted(datasets):
        ch = datasets[key]["channels"]
        total = ch.shape[0]
        if total == 0:
            # The mean of zero rows is NaN and would poison the whole matrix.
            raise ValueError(f"Dataset {key} contains no samples.")
        if 0 < limit < total:
            ch = ch[torch.randperm(total)[:limit]]
        # Otherwise every sample is used; shuffling would not change the mean.
        mats.append(_to_feature_matrix(ch))

    mats = _pad_to_same_dim(mats)
    C = torch.cat([M.mean(dim=0, keepdim=True) for M in mats], dim=0)  # [num_datasets, Df]

    if distance_mode == "cosine_similarity":
        Cn = torch.nn.functional.normalize(C, dim=1)
        # Rounding can push 1 - cos slightly below zero for near-identical rows.
        D = (1.0 - (Cn @ Cn.T)).clamp_min(0.0)
    else:
        # "euclidean_centroid" and default fallback
        D = torch.cdist(C, C, p=2)
    return D
def compute_distance_matrix_umap(
    datasets: Dict[int, Dict[str, torch.Tensor]],
    umap_kwargs: dict,
    channel_representation: str,
    angle_delay_bins: int,
    n_per_dataset: int,
    distance_mode: str,
    sw_num_projections: int,
    label_aware: bool,
    label_weighting: str,
    label_max_per_class: int
) -> torch.Tensor:
    """
    UMAP backend placeholder: currently delegates to the RAW baseline.

    ``umap_kwargs``, ``channel_representation`` and ``angle_delay_bins`` are
    accepted but not used yet; every other argument is forwarded unchanged.
    """
    return compute_distance_matrix_raw(
        datasets=datasets,
        n_per_dataset=n_per_dataset,
        distance_mode=distance_mode,
        sw_num_projections=sw_num_projections,
        label_aware=label_aware,
        label_weighting=label_weighting,
        label_max_per_class=label_max_per_class,
    )
def compute_distance_matrix_lwm(
    datasets: Dict[int, Dict[str, torch.Tensor]],
    model_dir: str,
    n_per_dataset: int,
    distance_mode: str,
    sw_num_projections: int,
    label_aware: bool,
    label_weighting: str,
    label_max_per_class: int
) -> torch.Tensor:
    """
    LWM backend placeholder: currently delegates to the RAW baseline.

    ``model_dir`` is accepted but not used yet. It is meant for a future
    implementation that loads the backbone from that directory and computes
    pairwise distances from embeddings.
    """
    return compute_distance_matrix_raw(
        datasets=datasets,
        n_per_dataset=n_per_dataset,
        distance_mode=distance_mode,
        sw_num_projections=sw_num_projections,
        label_aware=label_aware,
        label_weighting=label_weighting,
        label_max_per_class=label_max_per_class,
    )
+# =========================
+# HF Model fetch (ONLY LWM)
+# =========================
def fetch_lwm_model(model_repo: str, local_dir: str) -> str:
    """Download a Hugging Face model repo snapshot into ``local_dir``.

    Creates ``local_dir`` if needed and calls ``ensure_hf_token`` before
    starting the download.

    Returns:
        A Markdown status line naming the repo and the target directory.
    """
    os.makedirs(local_dir, exist_ok=True)
    ensure_hf_token()
    download_options = {
        "repo_id": model_repo,
        "local_dir": local_dir,
        "local_dir_use_symlinks": False,
    }
    snapshot_download(**download_options)
    return f"Downloaded model repo: **{model_repo}** → `{local_dir}`"
+# =========================
+# UI callbacks
+# =========================
def on_fetch_model(model_repo: str, model_dir: str):
    """Handle the "Fetch LWM model" button.

    Blank inputs fall back to ``DEFAULT_MODEL_REPO`` / ``DEFAULT_MODEL_DIR``.

    Returns:
        ``(update for the model-dir textbox, Markdown status text)``. On
        failure the status carries the error message instead of raising.
    """
    try:
        model_repo = model_repo.strip() or DEFAULT_MODEL_REPO
        model_dir = model_dir.strip() or DEFAULT_MODEL_DIR
        status = fetch_lwm_model(model_repo, model_dir)
    except Exception as exc:
        # Report the problem in the UI; model_dir keeps whatever value it had
        # when the failure happened.
        status = f"**Error**: {exc}"
    return gr.update(value=model_dir), log_md(status)
def on_compute(
    files: List[gr.File],
    framework: str,
    distance_mode: str,
    n_per_dataset: int,
    sw_num_projections: int,
    label_aware: bool,
    label_weighting: str,
    label_max_per_class: int,
    model_dir: str,
    umap_mode: str,
    umap_n_components: int,
    umap_n_neighbors: int,
    umap_min_dist: float,
    channel_representation: str,
    angle_delay_bins: int
):
    """Handle the "Compute distance matrix" button.

    Parses the uploads, dispatches to the RAW / UMAP / LWM backend (any
    framework other than "RAW" or "UMAP" is treated as LWM) and formats the
    result for the dataframe output. ``umap_mode`` is accepted but not used.

    Returns:
        ``(dataframe update or None, Markdown status text)``. Any exception is
        reported through the status text instead of being raised.
    """

    def shared_args():
        # Built lazily at the call site so the numeric conversions run in the
        # same order as the backend's positional arguments.
        return (
            int(n_per_dataset),
            distance_mode,
            int(sw_num_projections),
            label_aware,
            label_weighting,
            int(label_max_per_class),
        )

    try:
        datasets = parse_uploaded_datasets(files)
        if len(datasets) < 2:
            return None, log_md("Please upload **≥ 2** datasets.")

        if framework == "RAW":
            D = compute_distance_matrix_raw(datasets, *shared_args())
        elif framework == "UMAP":
            umap_kwargs = {
                "n_components": int(umap_n_components),
                "n_neighbors": int(umap_n_neighbors),
                "min_dist": float(umap_min_dist),
                "metric": "euclidean",
                "random_state": 42,
            }
            D = compute_distance_matrix_umap(
                datasets,
                umap_kwargs,
                channel_representation,
                int(angle_delay_bins),
                *shared_args(),
            )
        else:  # LWM
            if not (model_dir and os.path.isdir(model_dir)):
                return None, log_md("LWM selected but **model dir** not found. Click *Fetch LWM model* first.")
            D = compute_distance_matrix_lwm(datasets, model_dir, *shared_args())

        matrix = D.detach().cpu().numpy().astype(float)
        headers = ["D{}".format(col) for col in range(matrix.shape[0])]
        table = []
        for row in matrix:
            table.append([round(value, 6) for value in row])
        update = gr.update(value=table, headers=headers, row_count=(len(table), "fixed"))
        return update, log_md("Done.")
    except Exception as e:
        return None, log_md(f"**Error**: {e}")
+# =========================
+# Gradio App
+# =========================
with gr.Blocks(title="Dataset Distancing Lab") as demo:
    gr.Markdown("# **Dataset Distancing Lab**  \nUpload multiple datasets and compute similarity via **LWM / UMAP / RAW**.")
    with gr.Row():
        # Left column: inputs and options.
        with gr.Column(scale=1):
            gr.Markdown("### 1) Upload datasets (≥ 2)")
            # NOTE(review): with type="binary" Gradio is expected to hand the
            # callback raw bytes rather than file objects; confirm the parser
            # receiving `files_in` handles that.
            files_in = gr.File(
                file_count="multiple",
                label="Upload .pt/.pth/.npy/.npz",
                type="binary",
            )

            gr.Markdown("### 2) Choose framework & options")
            framework_dd = gr.Radio(
                choices=["RAW", "UMAP", "LWM"],
                value="RAW",
                label="Framework",
            )
            distance_mode_dd = gr.Radio(
                choices=["sliced_wasserstein", "euclidean_centroid", "cosine_similarity"],
                value="euclidean_centroid",
                label="Distance mode",
            )
            n_per_ds_in = gr.Number(value=1024, precision=0, label="n_per_dataset (sampling)")
            sw_proj_in = gr.Number(value=64, precision=0, label="SW num projections")
            label_aware_cb = gr.Checkbox(value=True, label="Label-aware")
            label_weighting_dd = gr.Radio(
                choices=["uniform", "support"],
                value="uniform",
                label="Label weighting",
            )
            label_max_in = gr.Number(value=1e10, precision=0, label="Label max per class")

            with gr.Accordion("UMAP options", open=False):
                umap_mode_dd = gr.Radio(
                    choices=["unsupervised", "supervised"],
                    value="supervised",
                    label="UMAP mode",
                )
                umap_dim = gr.Slider(2, 256, value=128, step=1, label="UMAP n_components")
                umap_knn = gr.Slider(2, 100, value=32, step=1, label="UMAP n_neighbors")
                umap_min = gr.Slider(0.0, 0.99, value=0.1, step=0.01, label="UMAP min_dist")
                chan_repr = gr.Radio(
                    choices=["raw", "angle_delay"],
                    value="angle_delay",
                    label="Channel representation",
                )
                ad_bins = gr.Slider(4, 64, value=16, step=1, label="Angle-delay bins")

            compute_btn = gr.Button("Compute distance matrix")

            gr.Markdown("---")
            gr.Markdown("### (Optional) Fetch LWM-v1.1 model")
            model_repo_in = gr.Textbox(label="Model repo (HF)", value=DEFAULT_MODEL_REPO)
            model_dir_in = gr.Textbox(label="Local model dir", value=DEFAULT_MODEL_DIR)
            fetch_btn = gr.Button("Fetch LWM model")
            fetch_status = gr.Markdown()

        # Right column: results.
        with gr.Column(scale=1):
            gr.Markdown("### Distance Matrix")
            matrix_out = gr.Dataframe(
                headers=[],
                value=None,
                interactive=False,
                wrap=True,
                row_count=(0, "dynamic"),
            )
            run_status = gr.Markdown()

    # Event wiring. The input order must match the parameter order of the
    # callbacks on_fetch_model and on_compute.
    fetch_btn.click(
        on_fetch_model,
        inputs=[model_repo_in, model_dir_in],
        outputs=[model_dir_in, fetch_status],
    )
    compute_btn.click(
        on_compute,
        inputs=[
            files_in,
            framework_dd,
            distance_mode_dd,
            n_per_ds_in,
            sw_proj_in,
            label_aware_cb,
            label_weighting_dd,
            label_max_in,
            model_dir_in,
            umap_mode_dd,
            umap_dim,
            umap_knn,
            umap_min,
            chan_repr,
            ad_bins,
        ],
        outputs=[matrix_out, run_status],
    )

if __name__ == "__main__":
    demo.launch()