import torch
from transformers import AutoModel, AutoTokenizer

# Load the fine-tuned sentence-embedding model and its tokenizer
model_name = "SURIYA-KP/small-sentence-embeddings-fine-tuned-depression-symptoms"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # disable dropout so embeddings are deterministic


def get_embedding(text):
    # Tokenize the input text (padded/truncated to at most 128 tokens)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)

    # Mean pooling: average the token embeddings, masking out padding tokens
    token_embeddings = outputs.last_hidden_state
    attention_mask = inputs["attention_mask"]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    embedding = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    return embedding.numpy()[0]


text1 = "I feel worthless and useless."
text2 = "I am feeling happy and content today."

emb1 = get_embedding(text1)
emb2 = get_embedding(text2)

# Cosine similarity between the two sentence embeddings
cos_sim = torch.nn.functional.cosine_similarity(
    torch.tensor(emb1).unsqueeze(0),
    torch.tensor(emb2).unsqueeze(0)
).item()

print(f"Cosine similarity between texts: {cos_sim:.4f}")
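

# Optional sketch (not part of the original example): the same get_embedding
# helper can rank several candidate sentences against a query by cosine
# similarity. The query and candidate sentences below are illustrative
# placeholders, not data taken from the model card.
query = "I can't sleep and I feel hopeless."
candidates = [
    "I feel worthless and useless.",
    "I am feeling happy and content today.",
    "I have trouble falling asleep at night.",
]

query_emb = torch.tensor(get_embedding(query)).unsqueeze(0)
for sentence in candidates:
    cand_emb = torch.tensor(get_embedding(sentence)).unsqueeze(0)
    sim = torch.nn.functional.cosine_similarity(query_emb, cand_emb).item()
    print(f"{sim:.4f}  {sentence}")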