Den-d3j2d committed
Commit 542680b (verified) · Parent: 9444a03

Upload 2 files

Files changed (2):
  1. src/config/config.py +13 -0
  2. src/model/encoder.py +44 -0
src/config/config.py ADDED
@@ -0,0 +1,13 @@
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class ModelConfig:
+     bert_output_size: int = 312
+     embedding_size: int = 128
+
+
+ @dataclass
+ class TrainConfig:
+     epochs: int = 12
+     batch_size: int = 16
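
For reference, the 312 in bert_output_size matches the hidden size of the cointegrated/rubert-tiny checkpoint loaded by the encoder below, and embedding_size is the dimensionality of the projected output. A minimal sanity check, as an illustrative sketch rather than part of the commit (it assumes the repo root is importable):

from transformers import AutoConfig

from src.config.config import ModelConfig

# The encoder's linear layer maps hidden_size -> embedding_size,
# so bert_output_size must equal the checkpoint's hidden size.
bert_cfg = AutoConfig.from_pretrained("cointegrated/rubert-tiny")
assert ModelConfig().bert_output_size == bert_cfg.hidden_size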
src/model/encoder.py ADDED
@@ -0,0 +1,44 @@
+ import torch
+ from torch import nn
+ from torch.nn import init
+ from transformers import AutoTokenizer, AutoModel
+
+
+ class ProdFeatureEncoder(nn.Module):
+     """
+     Model for creating text embeddings with the pre-trained ruBERT-tiny BERT.
+
+     Attributes:
+         config (object): Configuration object containing model hyperparameters.
+         tokenizer (AutoTokenizer): Tokenizer instance for ruBERT-tiny.
+         model (AutoModel): Pre-trained ruBERT-tiny model instance.
+         fc (nn.Linear): Linear layer for dimensionality reduction.
+     """
+     def __init__(self, config):
+         """
+         Initializes the ProdFeatureEncoder model.
+
+         Args:
+             config (object): Configuration object containing model hyperparameters.
+         """
+         super().__init__()
+         self.config = config
+         self.tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
+         self.model = AutoModel.from_pretrained("cointegrated/rubert-tiny")
+         self.fc = nn.Linear(self.config.bert_output_size, self.config.embedding_size)
+         init.xavier_uniform_(self.fc.weight)
+         self.norm = nn.LayerNorm(self.config.embedding_size)  # note: defined but not applied in forward
+
+     def forward(self, text: str):
+         """
+         Creates an embedding for the input text.
+         Args:
+             text (str): Input text to create an embedding for.
+         Returns:
+             torch.Tensor: Embedding vector for the input text.
+         """
+         tokens = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
+         model_output = self.model(**{k: v.to(self.model.device) for k, v in tokens.items()})  # move inputs to the model's device
+         embedding = model_output.last_hidden_state[:, 0, :]  # take the [CLS] token representation
+         embedding = self.fc(embedding)  # project bert_output_size -> embedding_size
+         return embedding[0]  # drop the batch dimension for the single input text
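
A quick usage sketch for the new encoder; this is illustrative and not part of the commit, and it assumes the repository root is on PYTHONPATH so the two new files import as src.config.config and src.model.encoder:

import torch

from src.config.config import ModelConfig
from src.model.encoder import ProdFeatureEncoder

# Build the encoder from the default config and embed one product description.
encoder = ProdFeatureEncoder(ModelConfig())
encoder.eval()
with torch.no_grad():
    embedding = encoder("example product description")
print(embedding.shape)  # torch.Size([128]): one embedding_size-dim vector

Since forward returns embedding[0], each call yields a single 128-dimensional vector rather than a batch.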