# ---------------- Imports ----------------
# Standard library first; third-party imports keep their original relative
# order (torch before tensorflow) so native-library load order is unchanged.
import io
import json

import torch
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from transformers import BertTokenizer, BertModel
import gradio as gr
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# ---------------- Load models once ----------------
# Maps the label shown in the UI dropdown to the Hugging Face checkpoint name.
model_options = {
    "BERT Large Uncased": "bert-large-uncased",
    "BERT Large Cased": "bert-large-cased",
    "BERT Base Uncased": "bert-base-uncased",
    "BERT Base Cased": "bert-base-cased",
}

# Checkpoint loaded at startup; updated whenever a different model is
# loaded successfully.
current_model_name = "bert-large-uncased"

# Load ELMo (TF Hub)
elmo = hub.KerasLayer("https://tfhub.dev/google/elmo/3", trainable=False)

# Global state: one embedding row per stored sentence, kept in the same
# order as sentences_storage.
bert_embeddings_matrix = None
elmo_embeddings_matrix = None
sentences_storage = []

# Active BERT model/tokenizer pair, replaced together by the loaders below.
current_bert_model = None
current_tokenizer = None


def _try_load_bert_model(model_name):
    """Load a BERT tokenizer/model pair and make it the active pair.

    The globals are only replaced after BOTH the tokenizer and the model
    have loaded, so a failed load never leaves a tokenizer paired with a
    model from a different checkpoint.

    Args:
        model_name: Hugging Face checkpoint name (e.g. "bert-base-uncased").

    Returns:
        A ``(loaded, message)`` tuple: ``loaded`` is True on success and
        ``message`` is the human-readable status string.
    """
    global current_bert_model, current_tokenizer, current_model_name
    try:
        new_tokenizer = BertTokenizer.from_pretrained(model_name)
        new_model = BertModel.from_pretrained(model_name)
        new_model.eval()  # inference only: disable training-mode behavior
    except Exception as e:
        return False, f"āŒ Error loading {model_name}: {e}"

    current_tokenizer = new_tokenizer
    current_bert_model = new_model
    current_model_name = model_name
    return True, f"āœ… Loaded {model_name}"


def load_bert_model(model_name):
    """Load BERT model and tokenizer.

    Args:
        model_name: Hugging Face checkpoint name to load.

    Returns:
        A status string describing success or the load error. On failure
        the previously active model and tokenizer are left untouched.
    """
    return _try_load_bert_model(model_name)[1]


# Initialize with the default model. The checkpoint is loaded exactly once
# (the previous code loaded it twice, doubling startup time and memory).
# A failure here is fatal, as it was when the load ran outside any try block.
_startup_loaded, _startup_status = _try_load_bert_model(current_model_name)
if not _startup_loaded:
    raise RuntimeError(_startup_status)

# Backward-compatible aliases for the startup tokenizer/model. They are not
# updated on model changes; use current_tokenizer / current_bert_model.
tokenizer = current_tokenizer
bert_model = current_bert_model


# ---------------- Single sentence embedding function ----------------
def get_single_embedding(sentence):
    """Get BERT and ELMo embeddings for a single sentence.

    Args:
        sentence: The text to embed.

    Returns:
        A ``(bert_embedding, elmo_embedding)`` tuple of NumPy arrays. The
        BERT vector is the attention-mask-weighted mean of the final hidden
        states; the ELMo vector is the layer's default output with
        singleton dimensions removed.
    """
    # ------------ BERT ------------ #
    encoded = current_tokenizer(
        [sentence], return_tensors="pt", padding=True, truncation=True
    )
    with torch.no_grad():
        bert_output = current_bert_model(**encoded)
        # (1, seq_len, hidden_size) for the single-sentence batch.
        token_embeddings = bert_output.last_hidden_state
        # Broadcastable mask in the embeddings' dtype: (1, seq_len, 1).
        mask = encoded["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # Clamp so an all-zero mask can never cause a division by zero.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        # Mean pooling -> (1, hidden_size), then drop the batch dimension.
        bert_embedding = (summed / token_counts).squeeze(0).numpy()

    # ------------ ELMo ------------ #
    input_elmo = tf.convert_to_tensor([sentence], dtype=tf.string)
    elmo_emb = elmo(input_elmo)
    # The layer may return either a tensor or a dict of outputs; in the
    # dict case use the 'default' entry.
    if isinstance(elmo_emb, dict):
        elmo_embedding = elmo_emb['default']
    else:
        elmo_embedding = elmo_emb
    # Convert to numpy and remove extra dimensions.
    elmo_embedding = elmo_embedding.numpy().squeeze()

    return bert_embedding, elmo_embedding


def change_bert_model(model_choice):
    """Change BERT model and clear existing embeddings.

    Stored sentences and embeddings are only cleared when the new model
    actually loaded: embeddings from different checkpoints are not
    comparable, but a failed load keeps the old model active and therefore
    keeps the existing data valid.

    Args:
        model_choice: A key of ``model_options`` (the dropdown label).

    Returns:
        A ``(model_status, result_message, sentence_status)`` tuple for the
        three output components wired to the dropdown.
    """
    global bert_embeddings_matrix, elmo_embeddings_matrix, sentences_storage

    model_name = model_options[model_choice]
    loaded, status = _try_load_bert_model(model_name)

    if not loaded:
        kept_message = (
            "Model could not be loaded; the previous model and stored "
            "sentences were kept."
        )
        return status, kept_message, get_current_status()

    # Clear existing embeddings since we changed the model
    bert_embeddings_matrix = None
    elmo_embeddings_matrix = None
    sentences_storage = []

    clear_status = "šŸ”„ Model changed! Previous embeddings cleared. Please add sentences again."
    return status, clear_status, "šŸ“ No sentences added yet. Please add at least 2 sentences."
# Status text shown while no sentence is stored.
_NO_SENTENCES_STATUS = "šŸ“ No sentences added yet. Please add at least 2 sentences."

# Help text rendered below the results box, one Markdown item per line.
# NOTE(review): step 5 advertises an export/download feature, but no such
# control is wired up in the code visible here — confirm or remove the step.
_USAGE_MARKDOWN = """
### šŸ“– How to use:
1. **Choose Model**: Select your preferred BERT variant (uncased recommended for similarity)
2. **Add sentences**: Type a sentence and click "Add Sentence"
3. **Repeat**: Add at least 2 sentences (you can add more!)
4. **Compute**: Click "Compute Similarity" to see the results
5. **Export**: Download embeddings and similarity matrices for further analysis
6. **Interpret**: Values closer to 1.0 indicate higher similarity

### šŸ”¬ Models:
- **BERT Large Uncased**: Best for semantic similarity (recommended) - 1024 dimensions
- **BERT Large Cased**: Preserves capitalization, good for proper nouns - 1024 dimensions
- **BERT Base Uncased**: Faster, smaller model - 768 dimensions
- **BERT Base Cased**: Cased version of base model - 768 dimensions
- **ELMo**: Contextual word representations using LSTM - 1024 dimensions
"""


# ---------------- Add sentence function ----------------
def add_sentence(sentence):
    """Add a sentence and compute its embeddings.

    Args:
        sentence: Raw text from the input box; ``None`` and
            whitespace-only input are rejected.

    Returns:
        A ``(message, status)`` tuple: the outcome of this call and the
        current listing of stored sentences.
    """
    global bert_embeddings_matrix, elmo_embeddings_matrix, sentences_storage

    # Guard against None as well as blank text so validation never raises.
    if not sentence or not sentence.strip():
        return "Please enter a valid sentence.", get_current_status()

    sentence = sentence.strip()

    try:
        bert_emb, elmo_emb = get_single_embedding(sentence)

        # Each embedding becomes one 2D row: [1, features].
        bert_row = bert_emb.reshape(1, -1)
        elmo_row = elmo_emb.reshape(1, -1)

        if bert_embeddings_matrix is None:
            # First sentence - the rows are the initial matrices.
            new_bert, new_elmo = bert_row, elmo_row
        else:
            new_bert = np.vstack([bert_embeddings_matrix, bert_row])
            new_elmo = np.vstack([elmo_embeddings_matrix, elmo_row])
    except Exception as e:
        return f"āŒ Error processing sentence: {e}", get_current_status()

    # Commit everything together so both matrices and the sentence list
    # always have the same number of entries.
    bert_embeddings_matrix = new_bert
    elmo_embeddings_matrix = new_elmo
    sentences_storage.append(sentence)

    return f"āœ“ Added sentence {len(sentences_storage)}: '{sentence}'", get_current_status()


# ---------------- Get current status ----------------
def get_current_status():
    """Return current status of stored sentences.

    Returns:
        A prompt when fewer than two sentences are stored, otherwise a
        numbered listing followed by a ready notice.
    """
    count = len(sentences_storage)
    if count == 0:
        return _NO_SENTENCES_STATUS
    if count == 1:
        return (
            f"šŸ“ Current sentences ({count}/2 minimum):\n"
            f"1: {sentences_storage[0]}\n\n"
            "āž• Add at least 1 more sentence to compute similarity."
        )

    listing = "".join(
        f"{i}: {sent}\n" for i, sent in enumerate(sentences_storage, start=1)
    )
    return f"šŸ“ Current sentences ({count}):\n{listing}\nāœ… Ready to compute similarity!"


def _pairwise_cosine_similarity(matrix):
    """Return the row-by-row cosine similarity matrix of a 2D tensor."""
    # With L2-normalized rows, the dot products are the cosine similarities.
    normalized = torch.nn.functional.normalize(matrix, p=2, dim=1)
    return torch.mm(normalized, normalized.t())


# ---------------- Compute similarity ----------------
def compute_similarity():
    """Compute similarity matrices for stored embeddings.

    Returns:
        A formatted text report: the BERT and ELMo similarity matrices
        (computed with PyTorch), the maximum deviation from scikit-learn's
        ``cosine_similarity`` as a sanity check, the numbered sentences and
        the matrix shapes. If fewer than two sentences are stored, or an
        error occurs, a message string is returned instead.
    """
    if len(sentences_storage) < 2:
        return "āš ļø Please add at least 2 sentences before computing similarity."

    try:
        bert_tensor = torch.tensor(bert_embeddings_matrix, dtype=torch.float32)
        elmo_tensor = torch.tensor(elmo_embeddings_matrix, dtype=torch.float32)

        # Convert back to numpy for display.
        bert_sim = _pairwise_cosine_similarity(bert_tensor).numpy()
        elmo_sim = _pairwise_cosine_similarity(elmo_tensor).numpy()

        # Independent implementation, used only for the comparison check.
        bert_sim_sklearn = cosine_similarity(bert_embeddings_matrix)
        elmo_sim_sklearn = cosine_similarity(elmo_embeddings_matrix)

        bert_diff = np.max(np.abs(bert_sim - bert_sim_sklearn))
        elmo_diff = np.max(np.abs(elmo_sim - elmo_sim_sklearn))

        sentence_lines = "".join(
            f"{i}: {sentence}\n"
            for i, sentence in enumerate(sentences_storage, start=1)
        )

        report = [
            f"šŸ” Similarity Analysis for {len(sentences_storage)} sentences:\n\n",
            "šŸ¤– BERT Similarity Matrix (PyTorch):\n",
            f"{np.round(bert_sim, 3)}\n\n",
            "🧠 ELMo Similarity Matrix (PyTorch):\n",
            f"{np.round(elmo_sim, 3)}\n\n",
            "šŸ“Š Comparison Check:\n",
            f"BERT torch vs sklearn max diff: {bert_diff:.6f}\n",
            f"ELMo torch vs sklearn max diff: {elmo_diff:.6f}\n\n",
            "šŸ“„ Sentences Reference:\n",
            sentence_lines,
            "\nšŸ“Š Matrix Details:\n",
            f"BERT embeddings shape: {bert_embeddings_matrix.shape}\n",
            f"ELMo embeddings shape: {elmo_embeddings_matrix.shape}\n",
            f"Similarity matrices shape: {bert_sim.shape}",
        ]
        return "".join(report)

    except Exception as e:
        return f"āŒ Error computing similarity: {e}"


def clear_all():
    """Clear all stored sentences and embeddings.

    Returns:
        A ``(message, status)`` tuple for the result and status boxes.
    """
    global bert_embeddings_matrix, elmo_embeddings_matrix, sentences_storage

    bert_embeddings_matrix = None
    elmo_embeddings_matrix = None
    sentences_storage = []

    return "šŸ—‘ļø All sentences cleared.", _NO_SENTENCES_STATUS


# ---------------- Gradio Interface ----------------
# NOTE(review): the widget nesting below was reconstructed from
# whitespace-collapsed source — confirm it matches the intended layout.
with gr.Blocks(title="BERT + ELMo Sentence Similarity", theme=gr.themes.Soft()) as iface:
    gr.Markdown("# šŸ¤– BERT + ELMo Sentence Similarity Analyzer")
    gr.Markdown("Add sentences one by one (minimum 2) and compute pairwise similarity using BERT and ELMo embeddings.")

    # Model selection section
    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                choices=list(model_options.keys()),
                value="BERT Large Uncased",
                label="šŸ”§ Select BERT Model",
                info="Choose between cased/uncased and base/large variants",
            )
            model_status = gr.Textbox(
                label="šŸ“‹ Model Status",
                value="āœ… Loaded bert-large-uncased",
                lines=1,
                interactive=False,
            )

    # Sentence entry (left) and stored-sentence status (right)
    with gr.Row():
        with gr.Column(scale=2):
            sentence_input = gr.Textbox(
                label="Enter a sentence",
                placeholder="Type your sentence here... (e.g., 'I love machine learning')",
                lines=2,
            )

            with gr.Row():
                add_btn = gr.Button("āž• Add Sentence", variant="primary", size="lg")
                compute_btn = gr.Button("šŸ” Compute Similarity", variant="secondary", size="lg")
                clear_btn = gr.Button("šŸ—‘ļø Clear All", variant="stop", size="lg")

        with gr.Column(scale=1):
            status_output = gr.Textbox(
                label="šŸ“‹ Current Status",
                value=_NO_SENTENCES_STATUS,
                lines=8,
                interactive=False,
            )

    with gr.Row():
        result_output = gr.Textbox(
            label="šŸ“Š Similarity Results",
            lines=20,
            interactive=False,
            show_copy_button=True,
        )

    gr.Markdown(_USAGE_MARKDOWN)

    # Event handlers
    model_dropdown.change(
        fn=change_bert_model,
        inputs=[model_dropdown],
        outputs=[model_status, result_output, status_output],
    )

    add_btn.click(
        fn=add_sentence,
        inputs=[sentence_input],
        outputs=[result_output, status_output],
    ).then(
        lambda: "",  # Clear input after adding
        outputs=[sentence_input],
    )

    compute_btn.click(
        fn=compute_similarity,
        outputs=[result_output],
    )

    clear_btn.click(
        fn=clear_all,
        outputs=[result_output, status_output],
    )

if __name__ == "__main__":
    iface.launch(share=True)