Spaces:

Perunio
/

galis

Sleeping

App Files Community

Perunio committed on Sep 15

Commit

7dba0b0

1 Parent(s): 42dd08a

final app

Browse files

Files changed (6) hide show

galis_app.py +133 -72
llm/related_work_generator.py +13 -30
model/mlp.py +0 -137
model/paper_similarity.py +22 -27
model/simple_gcn_model.py +0 -37
model/train.py +0 -139

galis_app.py CHANGED Viewed

@@ -2,7 +2,10 @@ from pathlib import Path
 import streamlit as st
 from dataset.ogbn_link_pred_dataset import OGBNLinkPredDataset
 from model.paper_similarity import PaperSimilarityFinder
-from llm.related_work_generator import generate_related_work
 @st.cache_resource
@@ -17,72 +20,116 @@ def load_similarity_finder():
         model_name=model_name,
         embeddings_cache_path=embeddings_dir,
     )
-    return similarity_finder, dataset
 def format_top_k_predictions_from_similarity(similar_papers: list) -> str:
     markdown_list = []
     for i, (idx, score, text) in enumerate(similar_papers):
-        title = text.split('\n')[0].strip()
-        markdown_list.append(f"{i + 1}. **{title}** (Similarity: {score:.4f})")
     return "\n".join(markdown_list)
 def app():
     st.set_page_config(page_title="Galis", layout="wide")
     st.title("Galis")
     if "references" not in st.session_state:
-        st.session_state.references = None
     if "related_work" not in st.session_state:
-        st.session_state.related_work = None
     if "abstract_title" not in st.session_state:
         st.session_state.abstract_title = ""
     if "abstract_text" not in st.session_state:
         st.session_state.abstract_text = ""
-    similarity_finder, dataset = load_similarity_finder()
     col1, col2 = st.columns(2, gap="large")
-    with col2:
-        references_placeholder = st.empty()
-        related_work_placeholder = st.empty()
     with col1:
         st.header("Abstract Title")
-        abstract_title = st.text_input(
-            "Paste your title here",
-            st.session_state.abstract_title,
-            key="abstract_title_input",
-            label_visibility="collapsed",
         )
         st.header("Abstract Text")
-        abstract_input = st.text_area(
             "Paste your abstract here",
-            st.session_state.abstract_text,
-            key="abstract_text_input",
-            height=100,
             label_visibility="collapsed",
         )
-        st.write("...or **upload** a .txt file (first line = title, rest = abstract)")
-        uploaded_file = st.file_uploader(
-            "Drag and drop file here", type=["txt"], help="Limit 200MB per file • TXT"
         )
-        if uploaded_file is not None:
-            content = uploaded_file.getvalue().decode("utf-8").splitlines()
-            st.session_state.abstract_title = content[0] if content else ""
-            st.session_state.abstract_text = (
-                "\n".join(content[1:]) if len(content) > 1 else ""
-            )
-            st.rerun()
-        st.session_state.abstract_title = abstract_title
-        st.session_state.abstract_text = abstract_input
         num_citations = st.number_input(
             "Number of suggestions",
             min_value=1,
@@ -93,49 +140,63 @@ def app():
         )
         if st.button("Suggest References and related work", type="primary"):
-            if not abstract_title.strip() or not abstract_input.strip():
                 st.warning("Please provide both a title and an abstract.")
             else:
-                st.session_state.references = None
-                st.session_state.related_work = None
-                references_placeholder.empty()
-                related_work_placeholder.empty()
-                with st.spinner("Analyzing abstract and predicting references..."):
-                    similar_papers = similarity_finder.find_similar_papers(
-                        title=abstract_title,
-                        abstract=abstract_input,
-                        top_k=num_citations
-                    )
-                    references = format_top_k_predictions_from_similarity(similar_papers)
-                    st.session_state.references = references
-                with references_placeholder.container():
-                    st.header("Suggested References")
-                    with st.container(height=200):
-                        st.markdown(st.session_state.references, unsafe_allow_html=True)
-                with related_work_placeholder.container():
-                    with st.spinner("Generating related work section..."):
-                        related_work = generate_related_work(
-                            st.session_state.abstract_title,
-                            st.session_state.abstract_text,
-                            st.session_state.references,
-                        )
-                        st.session_state.related_work = related_work
-    if st.session_state.references:
-        with references_placeholder.container():
             st.header("Suggested References")
-            with st.container(height=200):
-                st.markdown(st.session_state.references, unsafe_allow_html=True)
-    if st.session_state.related_work:
-        with related_work_placeholder.container():
             st.header("Suggested Related Works")
-            with st.container(height=200):
-                st.markdown(st.session_state.related_work, unsafe_allow_html=True)
 if __name__ == "__main__":
-    app()

 import streamlit as st
 from dataset.ogbn_link_pred_dataset import OGBNLinkPredDataset
 from model.paper_similarity import PaperSimilarityFinder
+from llm.related_work_generator import (
+    generate_related_work,
+    create_related_work_pipeline,
+)
 @st.cache_resource
         model_name=model_name,
         embeddings_cache_path=embeddings_dir,
     )
+    pipeline = create_related_work_pipeline()
+    return pipeline, similarity_finder, dataset
 def format_top_k_predictions_from_similarity(similar_papers: list) -> str:
     markdown_list = []
     for i, (idx, score, text) in enumerate(similar_papers):
+        title = text.split("\n")[0].strip()
+        markdown_list.append(f"{i + 1}. {title} (Similarity: {score:.4f})")
     return "\n".join(markdown_list)
+def process_uploaded_file():
+    try:
+        uploaded_file = st.session_state.file_uploader
+        if uploaded_file is not None:
+            content = uploaded_file.getvalue().decode("utf-8").splitlines()
+            st.session_state.abstract_title = content[0] if content else ""
+            st.session_state.abstract_text = (
+                "\n".join(content[1:]) if len(content) > 1 else ""
+            )
+    except Exception as e:
+        st.error(f"Error processing file: {e}")
+GALIS_DESCRIPTION = """
+        ### About GALIS
+        **GALIS** is a web-based application designed to streamline and improve the creation of related work and
+        references sections for research papers. It leverages an existing semantic graph that captures the
+        relationships and core concepts among cited papers to guide language model outputs.
+        ### Objective
+        The primary objective is to provide a practical tool that helps researchers generate high-quality, coherent
+        related work and references sections, making the process of synthesizing literature more efficient and
+        insightful.
+        ---
+        ### How to Use GALIS
+        #### Option 1: Manual Input
+        1. **Enter your paper title** in the "Abstract Title" field
+        2. **Paste your abstract** in the "Abstract Text" area
+        3. **Set the number of suggestions** you want (1-100 papers)
+        4. **Click "Suggest References and related work"**
+        #### Option 2: File Upload
+        1. **Prepare a .txt file** with:
+           - **First line**: Your paper title
+           - **Remaining lines**: Your abstract text
+        2. **Upload the file** using the file uploader
+        3. **Set the number of suggestions** you want (1-100 papers)
+        4. **Click "Suggest References and related work"**
+        #### What You'll Get
+        - **Suggested References**: A curated list of relevant papers based on semantic similarity
+        - **Related Work Section**: An automatically generated related work section that synthesizes the suggested
+        papers
+        - **Regeneration Option**: You can regenerate the related work section if needed
+        ---
+        *Note: File uploads are limited to 200MB and must be in .txt format*
+        """
 def app():
     st.set_page_config(page_title="Galis", layout="wide")
     st.title("Galis")
+    with st.popover("What is Galis?"):
+        st.markdown(GALIS_DESCRIPTION)
     if "references" not in st.session_state:
+        st.session_state.references = ""
     if "related_work" not in st.session_state:
+        st.session_state.related_work = ""
     if "abstract_title" not in st.session_state:
         st.session_state.abstract_title = ""
     if "abstract_text" not in st.session_state:
         st.session_state.abstract_text = ""
+    pipeline, similarity_finder, dataset = load_similarity_finder()
     col1, col2 = st.columns(2, gap="large")
     with col1:
         st.header("Abstract Title")
+        st.text_input(
+            "Paste your title here", key="abstract_title", label_visibility="collapsed"
         )
         st.header("Abstract Text")
+        st.text_area(
             "Paste your abstract here",
+            key="abstract_text",
+            height=150,
             label_visibility="collapsed",
         )
+        st.file_uploader(
+            "Upload a .txt file here (first line = title, rest = abstract)",
+            type=["txt"],
+            help="Limit 200MB per file • TXT",
+            key="file_uploader",
+            on_change=process_uploaded_file,
         )
         num_citations = st.number_input(
             "Number of suggestions",
             min_value=1,
         )
         if st.button("Suggest References and related work", type="primary"):
+            if (
+                not st.session_state.abstract_title.strip()
+                or not st.session_state.abstract_text.strip()
+            ):
                 st.warning("Please provide both a title and an abstract.")
             else:
+                st.session_state.references = "LOADING"
+                st.session_state.related_work = ""
+    with col2:
+        if st.session_state.references == "LOADING":
+            with st.spinner("Analyzing abstract and predicting references..."):
+                similar_papers = similarity_finder.find_similar_papers(
+                    title=st.session_state.abstract_title,
+                    abstract=st.session_state.abstract_text,
+                    top_k=num_citations,
+                )
+                st.session_state.references = format_top_k_predictions_from_similarity(
+                    similar_papers
+                )
+                st.session_state.related_work = "LOADING"
+                st.rerun()
+        if st.session_state.references not in ["", "LOADING"]:
             st.header("Suggested References")
+            st.text_area(
+                "References",
+                value=st.session_state.references,
+                height=150,
+                label_visibility="collapsed",
+                key="ref_output",
+            )
             st.header("Suggested Related Works")
+            if st.session_state.related_work == "LOADING":
+                with st.spinner("Generating related work section..."):
+                    st.session_state.related_work = generate_related_work(
+                        pipeline,
+                        st.session_state.abstract_title,
+                        st.session_state.abstract_text,
+                        st.session_state.references,
+                    )
+                    st.rerun()
+            else:
+                st.text_area(
+                    "Related Works",
+                    value=st.session_state.related_work,
+                    height=300,
+                    label_visibility="collapsed",
+                    key="rw_output",
+                )
+            if st.button("Regenerate Related Works"):
+                st.session_state.related_work = "LOADING"
+                st.rerun()
 if __name__ == "__main__":
+    app()

llm/related_work_generator.py CHANGED Viewed

@@ -59,8 +59,8 @@ the novelty and importance of the user's project.
 Use appropriate terminology and focus on concepts, methods, and challenges relevant to that particular field of study.
 7.  **Output Format:** Generate only the text for the "Related Work" section. Do not include headers like
-"INSTRUCTIONS," "PAPER TITLE," or "PROVIDED CITATIONS" in the final output. The entire response should be the
-section text itself, ready to be inserted into an academic paper.
 """
@@ -74,16 +74,10 @@ def check_api_key():
 def create_related_work_pipeline():
-    """Creates a ready-to-use pipeline for generating the Related Work section."""
-    llm = ChatGoogleGenerativeAI(
-        model="gemini-2.0-flash-exp",
-        temperature=0.3
-    )
     prompt = PromptTemplate(
-        input_variables=["title", "abstract", "citations"],
-        template=PROMPT_TEXT
     )
     parser = StrOutputParser()
@@ -93,24 +87,12 @@ def create_related_work_pipeline():
     return chain
-def generate_related_work(title:str, abstract:str, citations_text: str) -> str:
-    """
-    Main function - pass title, abstract, and citations, get Related Work
-    Args:
-        title: The paper's title
-        abstract: The paper's abstract
-        citations_text: Text with citations (can be a list or a string)
-    Returns:
-        The generated Related Work section
-    """
-    pipeline = create_related_work_pipeline()
-    result = pipeline.invoke({
-        "title": title,
-        "abstract": abstract,
-        "citations": citations_text
-    })
     return result
@@ -141,11 +123,12 @@ Top 5 Citation Predictions:
     print("-" * 50)
     try:
-        related_work = generate_related_work(title, abstract, citations)
         print(related_work)
     except Exception as e:
         print(f"Error: {e}")
         print("1. Create a .env file in the same folder as the script")
         print("2. Add the line: GOOGLE_API_KEY=your_key")
         print("3. Get the key at: https://makersuite.google.com/app/apikey")
-        check_api_key()

 Use appropriate terminology and focus on concepts, methods, and challenges relevant to that particular field of study.
 7.  **Output Format:** Generate only the text for the "Related Work" section. Do not include headers like
+"INSTRUCTIONS" "PAPER TITLE", "RELATED WORK" or "PROVIDED CITATIONS" in the final output. Do not use markdown syntax.
+The entire response should be the section text itself, ready to be inserted into an academic paper.
 """
 def create_related_work_pipeline():
+    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", temperature=0.3)
     prompt = PromptTemplate(
+        input_variables=["title", "abstract", "citations"], template=PROMPT_TEXT
     )
     parser = StrOutputParser()
     return chain
+def generate_related_work(
+    pipeline, title: str, abstract: str, citations_text: str
+) -> str:
+    result = pipeline.invoke(
+        {"title": title, "abstract": abstract, "citations": citations_text}
+    )
     return result
     print("-" * 50)
     try:
+        pipeline = create_related_work_pipeline()
+        related_work = generate_related_work(pipeline, title, abstract, citations)
         print(related_work)
     except Exception as e:
         print(f"Error: {e}")
         print("1. Create a .env file in the same folder as the script")
         print("2. Add the line: GOOGLE_API_KEY=your_key")
         print("3. Get the key at: https://makersuite.google.com/app/apikey")
+        check_api_key()

model/mlp.py DELETED Viewed

@@ -1,137 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from sklearn.metrics import roc_auc_score, average_precision_score
-import numpy as np
-from dataset.ogbn_link_pred_dataset import (
-    OGBNLinkPredDataset,
-    OGBNLinkPredNegDataset,
-    # OGBNLinkPredNegDataset2,
-)
-from pathlib import Path
-from sentence_transformers import SentenceTransformer
-import argparse
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-BATCH_SIZE = 2048
-NUM_EPOCHS = 50
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--custom-neg", action=argparse.BooleanOptionalAction, default=False
-    )
-    parser.add_argument(
-        "--bert-embed", action=argparse.BooleanOptionalAction, default=False
-    )
-    return parser.parse_args()
-# --- Feature builder ---
-def edge_features(emb, ei):
-    u, v = ei
-    eu, ev = emb[u], emb[v]
-    return torch.cat([eu * ev, torch.abs(eu - ev)], dim=1)
-# --- Simple MLP ---
-class PairMLP(nn.Module):
-    def __init__(self, in_dim, hidden=256):
-        super().__init__()
-        self.fc1 = nn.Linear(in_dim, hidden)
-        self.fc2 = nn.Linear(hidden, 1)
-    def forward(self, x):
-        x = F.relu(self.fc1(x))
-        return self.fc2(x).squeeze(-1)
-# --- Training loop ---
-def run_epoch(data, train=True):
-    model.train(train)
-    total_loss = 0
-    idx = (
-        torch.randperm(data.edge_label.size(0))
-        if train
-        else torch.arange(data.edge_label.size(0))
-    )
-    for i in range(0, len(idx), BATCH_SIZE):
-        batch_end = min(i + BATCH_SIZE, data.edge_label.size(0))
-        batch_idx = idx[i:batch_end]
-        feats = edge_features(emb, data.edge_label_index[:, batch_idx]).to(DEVICE)
-        labels = data.edge_label[batch_idx].float().to(DEVICE)
-        scores = model(feats)
-        loss = F.binary_cross_entropy_with_logits(scores, labels)
-        if train:
-            opt.zero_grad()
-            loss.backward()
-            opt.step()
-        total_loss += loss.item() * len(batch_idx)
-    return total_loss / len(idx)
-@torch.no_grad()
-def evaluate(data):
-    scores_all, labels_all = [], []
-    for i in range(0, data.edge_label.size(0), BATCH_SIZE):
-        batch_end = min(i + BATCH_SIZE, data.edge_label.size(0))
-        feats = edge_features(emb, data.edge_label_index[:, i:batch_end]).to(DEVICE)
-        labels = data.edge_label[i : i + BATCH_SIZE]
-        scores = torch.sigmoid(model(feats)).cpu().numpy()
-        scores_all.append(scores)
-        labels_all.append(labels.numpy())
-    y_scores = np.concatenate(scores_all)
-    y_true = np.concatenate(labels_all)
-    return roc_auc_score(y_true, y_scores), average_precision_score(y_true, y_scores)
-if __name__ == "__main__":
-    args = parse_args()
-    USE_CUSTOM_NEG = args.custom_neg
-    USE_BERT_EMBED = args.bert_embed
-    # --- Load dataset + frozen embeddings ---
-    if USE_CUSTOM_NEG:
-        print("using hard negatives")
-        dataset = OGBNLinkPredNegDataset(val_size=0.1, test_size=0.2)
-    else:
-        print("using random negatives")
-        dataset = OGBNLinkPredDataset(val_size=0.1, test_size=0.2)
-    if USE_BERT_EMBED:
-        print("using BERT embeds")
-        if Path("model/embeddings.pth").exists():
-            emb = torch.load("model/embeddings.pth", map_location=DEVICE)
-        else:
-            st = SentenceTransformer("bongsoo/kpf-sbert-128d-v1", device=DEVICE)
-            emb = st.encode(
-                dataset.corpus, convert_to_tensor=True, show_progress_bar=True
-            )
-            Path("model").mkdir(parents=True, exist_ok=True)
-            torch.save(emb, "model/embeddings.pth")
-        emb = emb.to(DEVICE)
-    else:
-        print("using skipgram embeds")
-        emb = dataset.data.x
-    train_data, val_data, test_data = dataset.get_splits()
-    model = PairMLP(emb.size(1) * 2).to(DEVICE)
-    opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
-    # --- Training ---
-    best_roc, best_ap = 0.0, 0.0
-    for epoch in range(NUM_EPOCHS):
-        loss = run_epoch(train_data, train=True)
-        val_roc, val_ap = evaluate(val_data)
-        if val_roc > best_roc:
-            torch.save(
-                model.state_dict(), f"model_roc{str(val_roc)[:4].replace('.', '_')}.pth"
-            )
-        print(
-            f"Epoch {epoch + 1} | Loss {loss:.4f} | Val ROC {val_roc:.4f} | Val AP {val_ap:.4f}"
-        )
-    # --- Final test ---
-    test_roc, test_ap = evaluate(test_data)
-    print(f"Test ROC {test_roc:.4f} | Test AP {test_ap:.4f}")

model/paper_similarity.py CHANGED Viewed

@@ -236,9 +236,9 @@ class PaperSimilarityFinder:
     def compare_methods(self, title: str, abstract: str, top_k: int = 5):
         """Compare TF-IDF vs sentence embeddings"""
-        if not hasattr(self, 'corpus_vectors'):
             self._setup_tfidf()
-        if not hasattr(self, 'corpus_embeddings'):
             self._setup_sentence_embeddings()
         query = f"{title}\n{abstract}"
@@ -246,10 +246,8 @@ class PaperSimilarityFinder:
         tfidf_results = self._find_similar_tfidf(query, top_k)
         sent_results = self._find_similar_sentence_transformer(query, top_k)
-        return {
-            'tfidf': tfidf_results,
-            'sentence_transformer': sent_results
-        }
 if __name__ == "__main__":
     dataset = OGBNLinkPredDataset()
@@ -265,28 +263,27 @@ if __name__ == "__main__":
         embeddings_cache_path=embeddings_dir,
     )
-    my_title = "Polynomial Implicit Neural Representations For Large Diverse Datasets"
     my_abstract = """
-        Implicit neural representations (INR) have gained significant popularity for signal and image representation for
-        many end-tasks, such as superresolution, 3D modeling, and
-        more. Most INR architectures rely on sinusoidal positional
-        encoding, which accounts for high-frequency information in
-        data. However, the finite encoding size restricts the model’s
-        representational power. Higher representational power is
-        needed to go from representing a single given image to representing large and diverse datasets. Our approach addresses
-        this gap by representing an image with a polynomial function
-        and eliminates the need for positional encodings. Therefore,
-        to achieve a progressively higher degree of polynomial representation, we use element-wise multiplications between
-        features and affine-transformed coordinate locations after
-        every ReLU layer. The proposed method is evaluated qualitatively and quantitatively on large datasets like ImageNet.
-        The proposed Poly-INR model performs comparably to stateof-the-art generative models without any convolution,
-        normalization, or self-attention layers, and with far fewer trainable parameters. With much fewer training parameters and
-        higher representative power, our approach paves the way
-        for broader adoption of INR models for generative modeling tasks in complex domains. The code is available at
-        https://github.com/Rajhans0/Poly_INR
     """
-    top_k = 5
     print(f"\nTop {top_k} Citation Predictions:\n")
     top_papers = similarity_finder.find_similar_papers(
@@ -311,5 +308,3 @@ if __name__ == "__main__":
     for idx, score, text in top_papers_cached:
         title = text.split("\n")[0].strip()
         print(f"Title: '{title}'")

     def compare_methods(self, title: str, abstract: str, top_k: int = 5):
         """Compare TF-IDF vs sentence embeddings"""
+        if not hasattr(self, "corpus_vectors"):
             self._setup_tfidf()
+        if not hasattr(self, "corpus_embeddings"):
             self._setup_sentence_embeddings()
         query = f"{title}\n{abstract}"
         tfidf_results = self._find_similar_tfidf(query, top_k)
         sent_results = self._find_similar_sentence_transformer(query, top_k)
+        return {"tfidf": tfidf_results, "sentence_transformer": sent_results}
 if __name__ == "__main__":
     dataset = OGBNLinkPredDataset()
         embeddings_cache_path=embeddings_dir,
     )
+    my_title = (
+        "PointNet: Deep Learning on Point Sets for 3D Classification and Segmentation"
+    )
     my_abstract = """
+        Point cloud is an important type of geometric data
+        structure. Due to its irregular format, most researchers
+        transform such data to regular 3D voxel grids or collections
+        of images. This, however, renders data unnecessarily
+        voluminous and causes issues. In this paper, we design a
+        novel type of neural network that directly consumes point
+        clouds, which well respects the permutation invariance of
+        points in the input. Our network, named PointNet, provides a unified architecture for applications ranging from
+        object classification, part segmentation, to scene semantic
+        parsing. Though simple, PointNet is highly efficient and
+        effective. Empirically, it shows strong performance on
+        par or even better than state of the art. Theoretically,
+        we provide analysis towards understanding of what the
+        network has learnt and why the network is r
     """
+    top_k = 10
     print(f"\nTop {top_k} Citation Predictions:\n")
     top_papers = similarity_finder.find_similar_papers(
     for idx, score, text in top_papers_cached:
         title = text.split("\n")[0].strip()
         print(f"Title: '{title}'")

model/simple_gcn_model.py DELETED Viewed

@@ -1,37 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch_geometric.nn import GCNConv
-class EdgeDecoder(torch.nn.Module):
-    """Predict citation existence of two node embeddings."""
-    def __init__(self, in_channels):
-        super().__init__()
-        self.linear = torch.nn.Linear(in_channels * 2, 1)
-    def forward(self, z, edge_index):
-        row, col = edge_index
-        # Concatenate the embeddings of the two nodes
-        z_cat = torch.cat([z[row], z[col]], dim=-1)
-        return self.linear(z_cat).squeeze(-1)
-class SimpleGCN(torch.nn.Module):
-    """Include encoder and decoder part. Encoder creates embedding for given node and decoder predict link existence between node embeddings."""
-    def __init__(self, in_channels, hidden_channels, out_channels):
-        super().__init__()
-        self.conv1 = GCNConv(in_channels, hidden_channels)
-        self.conv2 = GCNConv(hidden_channels, out_channels)
-        self.decoder = EdgeDecoder(out_channels)
-    def forward(self, x, edge_index):
-        x = self.conv1(x, edge_index).relu()
-        x = F.dropout(x, p=0.5, training=self.training)
-        z = self.conv2(x, edge_index)
-        return z
-    def decode(self, z, edge_label_index):
-        # We pass the edge_label_index to the decoder, which contains both pos and neg edges
-        return self.decoder(z, edge_label_index)

model/train.py DELETED Viewed

@@ -1,139 +0,0 @@
-import torch
-import numpy as np
-from torch_geometric.loader import LinkNeighborLoader
-from sklearn.metrics import roc_auc_score, accuracy_score
-from tqdm import tqdm
-from model.simple_gcn_model import SimpleGCN
-from dataset.ogbn_link_pred_dataset import OGBNLinkPredDataset
-BATCH_SIZE = 128
-NUM_EPOCHS = 20
-LR = 0.001
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# data
-dataset = OGBNLinkPredDataset(val_size=0.1, test_size=0.2)
-train_data, val_data, test_data = dataset.get_splits()
-train_loader = LinkNeighborLoader(
-    train_data,
-    num_neighbors=[-1, -1],  # Use all neighbors
-    neg_sampling_ratio=1.0,  # 1 negative sample per positive edge
-    edge_label_index=train_data.edge_label_index,
-    edge_label=train_data.edge_label,
-    batch_size=BATCH_SIZE,
-    shuffle=True,
-    num_workers=4,
-)
-val_loader = LinkNeighborLoader(
-    val_data,
-    num_neighbors=[-1, -1],
-    neg_sampling_ratio=0.0,  # RandomLinkSplit already added negative edges
-    edge_label_index=val_data.edge_label_index,
-    edge_label=val_data.edge_label,
-    batch_size=BATCH_SIZE,
-    shuffle=False,
-    num_workers=4,
-)
-test_loader = LinkNeighborLoader(
-    test_data,
-    num_neighbors=[-1, -1],
-    neg_sampling_ratio=0.0,
-    edge_label_index=test_data.edge_label_index,
-    edge_label=test_data.edge_label,
-    batch_size=BATCH_SIZE,
-    shuffle=False,
-    num_workers=4,
-)
-# model
-model = SimpleGCN(
-    in_channels=dataset.num_features,
-    hidden_channels=256,
-    out_channels=128,
-).to(DEVICE)
-optimizer = torch.optim.Adam(model.parameters(), lr=LR)
-criterion = torch.nn.BCEWithLogitsLoss()
-# training
-def train(train_loader, epoch):
-    model.train()
-    total_loss = 0
-    scaler = torch.GradScaler()
-    pbar = tqdm(train_loader, desc=f"Training Epoch: {epoch}")
-    for batch in pbar:
-        batch = batch.to(DEVICE)
-        optimizer.zero_grad()
-        with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
-            z = model(batch.x, batch.edge_index)
-            out = model.decode(z, batch.edge_label_index)
-            labels = batch.edge_label.float()
-            loss = criterion(out, labels)
-        scaler.scale(loss).backward()
-        scaler.step(optimizer)
-        scaler.update()
-        total_loss += loss.item()
-        pbar.set_postfix(loss=f"{loss.item():.4f}")
-    return total_loss / len(train_loader)
-@torch.no_grad()
-def calc_metrics(loader):
-    model.eval()
-    all_scores = []
-    all_labels = []
-    pbar = tqdm(loader, desc="Testing")
-    for batch in pbar:
-        batch = batch.to(DEVICE)
-        with torch.autocast(device_type=DEVICE.type, dtype=torch.bfloat16):
-            z = model(batch.x, batch.edge_index)
-            out = model.decode(z, batch.edge_label_index)
-        scores = torch.sigmoid(out).float().cpu().numpy()
-        labels = batch.edge_label.cpu().numpy()
-        all_scores.append(scores)
-        all_labels.append(labels)
-    all_scores = np.concatenate(all_scores)
-    all_labels = np.concatenate(all_labels)
-    return roc_auc_score(all_labels, all_scores), accuracy_score(
-        all_labels, all_scores > 0.5
-    )
-if __name__ == "__main__":
-    best_val_auc = 0
-    best_auc = 0
-    for epoch in range(1, NUM_EPOCHS + 1):
-        loss = train(train_loader, epoch)
-        val_auc, val_acc = calc_metrics(val_loader)
-        print(
-            f"Epoch: {epoch:03d}, Loss: {loss:.4f}, Val AUC: {val_auc:.4f}, Val acc: {val_acc:.4f}",
-            end=" ",
-        )
-        if val_auc > best_val_auc:
-            print("New best")
-            best_val_auc = val_auc
-            best_auc = val_auc
-            torch.save(model.state_dict(), "model.pth")
-    test_auc, test_acc = calc_metrics(test_loader)
-    print("-" * 30)
-    print(f"Best validation AUC: {best_auc:.4f}")