---
library_name: transformers
tags: []
---
# Model Card for aspect-based-embeddings-v3

This is an embedding model for clinical papers. Embeddings are taken from the final hidden state of the `[CLS]` token (CLS pooling), as shown in the examples below.
## How to Use
### Simple fine-tuned model
```python
from transformers import AutoTokenizer, AutoModel
import torch
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
PATH = "josh-oo/aspect-based-embeddings-v3"
tokenizer = AutoTokenizer.from_pretrained(PATH)
tokenizer.model_max_length = 512  # truncate inputs to 512 tokens
model = AutoModel.from_pretrained(PATH)
dummy_text = "This is a title of a medical paper"
dummy_input = tokenizer([dummy_text], return_tensors="pt", truncation=True)
dummy_input = dummy_input.to(DEVICE)
model.to(DEVICE)

with torch.no_grad():
    output = model(**dummy_input)

embeddings = output.last_hidden_state[:, 0]  # CLS pooling
```
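For example, two titles can be embedded and compared with cosine similarity. This is a minimal sketch building on the snippet above; the `embed` helper and the example titles are illustrative:

```python
import torch.nn.functional as F

def embed(texts):
    """Encode a batch of texts and return their CLS embeddings."""
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    inputs = inputs.to(DEVICE)
    with torch.no_grad():
        return model(**inputs).last_hidden_state[:, 0]

emb = embed(["Aspirin for primary prevention", "Statins in elderly patients"])
print(F.cosine_similarity(emb[0], emb[1], dim=0).item())
```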
### Aspect-guided model

A separate revision of the checkpoint prepends four aspect tokens (`<participants>`, `<intervention>`, `<condition>`, `<outcome>`) to each input. The buffers registered below give these prefix tokens position id 0 and token type 1, so the actual text keeps its usual position ids and token type 0.
```python
from transformers import AutoTokenizer, AutoModel
import torch
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
PATH = "josh-oo/aspect-based-embeddings-v3"
REVISION = "4f43387343acaacd9bfafec0a304c51ed140f078"
PREFIXES = ["<participants>","<intervention>","<condition>","<outcome>"]
tokenizer = AutoTokenizer.from_pretrained(PATH, revision=REVISION)
model = AutoModel.from_pretrained(PATH, revision=REVISION)
model.register_buffer("position_ids", torch.relu(torch.arange(model.config.max_position_embeddings + len(PREFIXES)).expand((1, -1)) - len(PREFIXES)), persistent=False)
model.register_buffer("token_type_ids", torch.zeros(model.position_ids.size(), dtype=torch.long), persistent=False) #set token type ids to 0
model.token_type_ids[:,1:1+len(PREFIXES)] = 1 #set prefix token type ids to 1
dummy_text = "".join(PREFIXES) + "This is a title of a medical paper"
dummy_input = tokenizer([dummy_text], return_tensors="pt", truncation=True)
dummy_input.pop('token_type_ids')
dummy_input.to(DEVICE)
model.to(DEVICE)
with torch.no_grad():
output = model(**dummy_input)
embeddings = output.last_hidden_state[:, 0] #cls pooling
```
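As before, the aspect-guided embeddings can be compared directly. This is a minimal sketch building on the snippet above; the `embed_with_aspects` helper and the example titles are illustrative:

```python
import torch.nn.functional as F

def embed_with_aspects(texts):
    """Prepend the aspect tokens, encode, and return CLS embeddings."""
    prefixed = ["".join(PREFIXES) + text for text in texts]
    inputs = tokenizer(prefixed, return_tensors="pt", padding=True, truncation=True)
    inputs.pop('token_type_ids')  # fall back to the registered buffer
    inputs = inputs.to(DEVICE)
    with torch.no_grad():
        return model(**inputs).last_hidden_state[:, 0]

emb = embed_with_aspects(["Aspirin for primary prevention", "Statins in elderly patients"])
print(F.cosine_similarity(emb[0], emb[1], dim=0).item())
```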