apapagi committed on
Commit b284bd5 · verified · 1 Parent(s): fa5bbf2

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ architecture.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,76 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: eupl-1.1
+ datasets:
+ - EuropeanParliament/cellar_eurovoc
+ language:
+ - en
+ tags:
+ - eurovoc
+ pipeline_tag: text-classification
+
+ widget:
+ - text: "The Union condemns the continuing grave human rights violations by the Myanmar armed forces, including torture, sexual and gender-based violence, the persecution of civil society actors, human rights defenders and journalists, and attacks on the civilian population, including ethnic and religious minorities."
+
+ ---
+
+ # EuroVoc Multilabel Classifier 🇪🇺
+
+ [EuroVoc](https://op.europa.eu/fr/web/eu-vocabularies) is a large multidisciplinary, multilingual (24 languages of the 🇪🇺) hierarchical thesaurus of more than 7000 classes covering the activities of the EU institutions.
+ Given the number of legal documents produced every day and the huge mass of pre-existing documents still to be classified, high-quality automated or semi-automated classification methods are most welcome in this domain.
+
+ This BERT-based deep neural network model was trained on more than 3.9 million documents for this task and is used in a production environment via a Hugging Face inference endpoint.
+ The model supports the 24 languages of the European Union.
+
+ ## Architecture
+
+ ![architecture](architecture.png)
+
+ This classification model is built on top of [EUBERT](https://huggingface.co/EuropeanParliament/EUBERT) with a classification head covering the 7331 EuroVoc labels.
+
+ With less than 100 million parameters, it can be deployed on commodity hardware without GPU acceleration (around 200 ms per inference for 2000 characters).
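+
+ Concretely, the head is a dropout layer followed by a single linear layer over the pooled EUBERT output, with a sigmoid per label. The sketch below mirrors the `EurovocTagger` class in `eurovoc.py`; the `EurovocHead` name is only illustrative:
+
+ ```python
+ import torch
+ import torch.nn as nn
+ from transformers import AutoModel
+
+ class EurovocHead(nn.Module):
+     """Illustrative sketch of the model structure used by EurovocTagger in eurovoc.py."""
+     def __init__(self, bert_model_name="EuropeanParliament/EUBERT", n_classes=7331):
+         super().__init__()
+         self.bert = AutoModel.from_pretrained(bert_model_name)   # EUBERT encoder
+         self.dropout = nn.Dropout(p=0.2)                         # dropout on the pooled output
+         self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
+
+     def forward(self, input_ids, attention_mask):
+         pooled = self.bert(input_ids, attention_mask=attention_mask).pooler_output
+         return torch.sigmoid(self.classifier(self.dropout(pooled)))  # independent per-label probabilities
+ ```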
+
+ Training parameters (a hypothetical sketch of how these map onto the code in `eurovoc.py` follows the list):
+ - Number of epochs: 16
+ - Batch size: 10
+ - Max length: 512
+ - Learning rate: 5e-05
+
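+ The sketch below shows how these hyperparameters could be plugged into the `EurovocDataModule` and `EurovocTagger` classes shipped in `eurovoc.py`; the placeholder data and trainer options are assumptions, not the original training script:
+
+ ```python
+ import numpy as np
+ import pytorch_lightning as pl
+ from eurovoc import EurovocDataModule, EurovocTagger
+
+ # Illustrative placeholders: texts as a column of strings, labels as a
+ # multi-hot matrix over the 7331 EuroVoc classes (not the original training data).
+ x_train = np.array([["Example document text"]])
+ y_train = np.zeros((1, 7331))
+ y_train[0, 0] = 1
+ x_test, y_test = x_train, y_train
+
+ dm = EurovocDataModule("EuropeanParliament/EUBERT", x_train, y_train, x_test, y_test,
+                        batch_size=10, max_token_len=512)   # batch size / max length from the card
+ model = EurovocTagger("EuropeanParliament/EUBERT", n_classes=7331, lr=5e-05)
+
+ trainer = pl.Trainer(max_epochs=16)                         # number of epochs from the card
+ trainer.fit(model, dm)
+ ```
+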
+ ## Usage
+
+ ```python
+ from eurovoc import EurovocTagger
+
+ model = EurovocTagger.from_pretrained("EuropeanParliament/eurovoc_eu")
+ ```
+
+ See also the source code in `eurovoc.py` and the inference endpoint handler in `handler.py`.
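+
+ The following is a minimal local-inference sketch that mirrors the logic of `handler.py`. It assumes the repository files (in particular `mlb.pickle`, which holds the label set) are available in the working directory; the example text is shortened:
+
+ ```python
+ import pickle
+ import torch
+ from transformers import AutoTokenizer
+ from eurovoc import EurovocTagger
+
+ # Label binarizer shipped with the repository; its classes_ are the EuroVoc labels.
+ mlb = pickle.load(open("mlb.pickle", "rb"))
+
+ tokenizer = AutoTokenizer.from_pretrained("EuropeanParliament/EUBERT")
+ model = EurovocTagger.from_pretrained("EuropeanParliament/eurovoc_eu",
+                                       bert_model_name="EuropeanParliament/EUBERT",
+                                       n_classes=len(mlb.classes_))
+ model.eval()
+
+ text = "The Union condemns the continuing grave human rights violations by the Myanmar armed forces."
+ item = tokenizer(text, max_length=512, padding="max_length", truncation=True, return_tensors="pt")
+ with torch.no_grad():
+     _, scores = model(item["input_ids"], item["attention_mask"])
+
+ # Rank labels by score and keep the top 5 above the default 0.16 threshold.
+ ranked = sorted(zip(mlb.classes_, scores[0].tolist()), key=lambda x: x[1], reverse=True)
+ print([(label, round(score, 3)) for label, score in ranked[:5] if score > 0.16])
+ ```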
+
+ ### Payload example
+
+ ```json
+ {
+   "inputs": "The Union condemns the continuing grave human rights violations by the Myanmar armed forces, including torture, sexual and gender-based violence, the persecution of civil society actors, human rights defenders and journalists, and attacks on the civilian population, including ethnic and religious minorities.",
+   "topk": 10,
+   "threshold": 0.16
+ }
+ ```
+
+ Result:
+
+ ```python
+ {'results': [{'label': 'international sanctions', 'score': 0.9994925260543823},
+              {'label': 'economic sanctions', 'score': 0.9991770386695862},
+              {'label': 'natural person', 'score': 0.9591936469078064},
+              {'label': 'EU restrictive measure', 'score': 0.8388392329216003},
+              {'label': 'legal person', 'score': 0.45630475878715515},
+              {'label': 'Burma/Myanmar', 'score': 0.43375277519226074}]}
+ ```
+
+ Only six results are returned because the score of the next label falls below the 0.16 threshold.
+
+ The default values are `topk = 5` and `threshold = 0.16`.
+
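+ A minimal sketch of posting this payload to a deployed Inference Endpoint with `requests` is shown below; the URL and token are placeholders for your own deployment, not a public address:
+
+ ```python
+ import requests
+
+ API_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"   # placeholder: your deployed endpoint
+ headers = {"Authorization": "Bearer <HF_TOKEN>", "Content-Type": "application/json"}  # placeholder token
+
+ payload = {
+     "inputs": "The Union condemns the continuing grave human rights violations by the Myanmar armed forces.",
+     "topk": 10,
+     "threshold": 0.16,
+ }
+
+ response = requests.post(API_URL, headers=headers, json=payload)
+ print(response.json())   # {"results": [{"label": ..., "score": ...}, ...]}
+ ```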
+
+ ## Author(s)
+
+ Andreas Papagiannis <[email protected]>
added_tokens.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "</s>": 65537,
3
+ "<mask>": 65540,
4
+ "<pad>": 65539,
5
+ "<s>": 65536,
6
+ "<unk>": 65538
7
+ }
architecture.png ADDED

Git LFS Details

  • SHA256: b1913dd26f85243cbb6cec67f771627d6729aa453d77b09a469f64abd29f0913
  • Pointer size: 131 Bytes
  • Size of remote file: 352 kB
eurovoc.py ADDED
@@ -0,0 +1,212 @@
1
+ import torch
2
+ from torch.utils.data import Dataset, DataLoader
3
+ import numpy as np
4
+ import pytorch_lightning as pl
5
+ import torch.nn as nn
6
+ from transformers import BertTokenizerFast as BertTokenizer, AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModel
7
+ from huggingface_hub import PyTorchModelHubMixin
8
+
9
+
10
+ class EurovocDataset(Dataset):
11
+
12
+ def __init__(
13
+ self,
14
+ text: np.array,
15
+ labels: np.array,
16
+ tokenizer: BertTokenizer,
17
+ max_token_len: int = 128
18
+ ):
19
+ self.tokenizer = tokenizer
20
+ self.text = text
21
+ self.labels = labels
22
+ self.max_token_len = max_token_len
23
+
24
+ def __len__(self):
25
+ return len(self.labels)
26
+
27
+ def __getitem__(self, index: int):
28
+ text = self.text[index][0]
29
+ labels = self.labels[index]
30
+
31
+ encoding = self.tokenizer.encode_plus(
32
+ text,
33
+ add_special_tokens=True,
34
+ max_length=self.max_token_len,
35
+ return_token_type_ids=False,
36
+ padding="max_length",
37
+ truncation=True,
38
+ return_attention_mask=True,
39
+ return_tensors='pt',
40
+ )
41
+
42
+ return dict(
43
+ text=text,
44
+ input_ids=encoding["input_ids"].flatten(),
45
+ attention_mask=encoding["attention_mask"].flatten(),
46
+ labels=torch.FloatTensor(labels)
47
+ )
48
+
49
+
50
+ class EuroVocLongTextDataset(Dataset):
51
+
52
+ def __splitter__(self, text, max_length):
53
+ l = text.split()
54
+ for i in range(0, len(l), max_length):
55
+ yield " ".join(l[i:i + max_length])
56
+
57
+ def __init__(
58
+ self,
59
+ text: np.array,
60
+ labels: np.array,
61
+ tokenizer: BertTokenizer,
62
+ max_token_len: int = 128
63
+ ):
64
+ self.tokenizer = tokenizer
65
+ self.text = text
66
+ self.labels = labels
67
+ self.max_token_len = max_token_len
68
+
69
+ self.chunks_and_labels = [(c, l) for t, l in zip(self.text, self.labels) for c in self.__splitter__(t, self.max_token_len)]
70
+
71
+ self.encoding = self.tokenizer.batch_encode_plus(
72
+ [c for c, _ in self.chunks_and_labels],
73
+ add_special_tokens=True,
74
+ max_length=self.max_token_len,
75
+ return_token_type_ids=False,
76
+ padding="max_length",
77
+ truncation=True,
78
+ return_attention_mask=True,
79
+ return_tensors='pt',
80
+ )
81
+
82
+ def __len__(self):
83
+ return len(self.chunks_and_labels)
84
+
85
+ def __getitem__(self, index: int):
86
+ text, labels = self.chunks_and_labels[index]
87
+
88
+ return dict(
89
+ text=text,
90
+ input_ids=self.encoding["input_ids"][index].flatten(),
91
+ attention_mask=self.encoding["attention_mask"][index].flatten(),
92
+ labels=torch.FloatTensor(labels)
93
+ )
94
+
95
+
96
+ class EurovocDataModule(pl.LightningDataModule):
97
+
98
+ def __init__(self, bert_model_name, x_tr, y_tr, x_test, y_test, batch_size=8, max_token_len=512):
99
+ super().__init__()
100
+
101
+ self.batch_size = batch_size
102
+ self.x_tr = x_tr
103
+ self.y_tr = y_tr
104
+ self.x_test = x_test
105
+ self.y_test = y_test
106
+ self.tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
107
+ self.max_token_len = max_token_len
108
+
109
+ def setup(self, stage=None):
110
+ self.train_dataset = EurovocDataset(
111
+ self.x_tr,
112
+ self.y_tr,
113
+ self.tokenizer,
114
+ self.max_token_len
115
+ )
116
+
117
+ self.test_dataset = EurovocDataset(
118
+ self.x_test,
119
+ self.y_test,
120
+ self.tokenizer,
121
+ self.max_token_len
122
+ )
123
+
124
+ def train_dataloader(self):
125
+ return DataLoader(
126
+ self.train_dataset,
127
+ batch_size=self.batch_size,
128
+ shuffle=True,
129
+ num_workers=2
130
+ )
131
+
132
+ def val_dataloader(self):
133
+ return DataLoader(
134
+ self.test_dataset,
135
+ batch_size=self.batch_size,
136
+ num_workers=2
137
+ )
138
+
139
+ def test_dataloader(self):
140
+ return DataLoader(
141
+ self.test_dataset,
142
+ batch_size=self.batch_size,
143
+ num_workers=2
144
+ )
145
+
146
+
147
+ class EurovocTagger(pl.LightningModule, PyTorchModelHubMixin):
148
+
149
+ def __init__(self, bert_model_name, n_classes, lr=2e-5, eps=1e-8):
150
+ super().__init__()
151
+ self.bert = AutoModel.from_pretrained(bert_model_name)
152
+ self.dropout = nn.Dropout(p=0.2)
153
+ self.classifier1 = nn.Linear(self.bert.config.hidden_size, n_classes)
154
+ self.criterion = nn.BCELoss()
155
+ self.lr = lr
156
+ self.eps = eps
157
+
158
+ def forward(self, input_ids, attention_mask, labels=None):
159
+ output = self.bert(input_ids, attention_mask=attention_mask)
160
+ output = self.dropout(output.pooler_output)
161
+ output = self.classifier1(output)
162
+ output = torch.sigmoid(output)
163
+ loss = 0
164
+ if labels is not None:
165
+ loss = self.criterion(output, labels)
166
+ return loss, output
167
+
168
+ def training_step(self, batch, batch_idx):
169
+ input_ids = batch["input_ids"]
170
+ attention_mask = batch["attention_mask"]
171
+ labels = batch["labels"]
172
+ loss, outputs = self(input_ids, attention_mask, labels)
173
+ self.log("train_loss", loss, prog_bar=True, logger=True)
174
+ return {"loss": loss, "predictions": outputs, "labels": labels}
175
+
176
+ def validation_step(self, batch, batch_idx):
177
+ input_ids = batch["input_ids"]
178
+ attention_mask = batch["attention_mask"]
179
+ labels = batch["labels"]
180
+ loss, outputs = self(input_ids, attention_mask, labels)
181
+ self.log("val_loss", loss, prog_bar=True, logger=True)
182
+ return loss
183
+
184
+ def test_step(self, batch, batch_idx):
185
+ input_ids = batch["input_ids"]
186
+ attention_mask = batch["attention_mask"]
187
+ labels = batch["labels"]
188
+ loss, outputs = self(input_ids, attention_mask, labels)
189
+ self.log("test_loss", loss, prog_bar=True, logger=True)
190
+ return loss
191
+
192
+ def on_train_epoch_end(self, *args, **kwargs):
193
+ return
194
+ #labels = []
195
+ #predictions = []
196
+ #for output in args['outputs']:
197
+ # for out_labels in output["labels"].detach().cpu():
198
+ # labels.append(out_labels)
199
+ # for out_predictions in output["predictions"].detach().cpu():
200
+ # predictions.append(out_predictions)
201
+
202
+ #labels = torch.stack(labels).int()
203
+ #predictions = torch.stack(predictions)
204
+
205
+ #for i, name in enumerate(mlb.classes_):
206
+ # class_roc_auc = auroc(predictions[:, i], labels[:, i])
207
+ # self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)
208
+
209
+
210
+ def configure_optimizers(self):
211
+ return torch.optim.AdamW(self.parameters(), lr=self.lr, eps=self.eps)
212
+
handler.py ADDED
@@ -0,0 +1,77 @@
1
+ from typing import Dict, List, Any
2
+ import numpy as np
3
+ import pickle
4
+
5
+ from sklearn.preprocessing import MultiLabelBinarizer
6
+ from transformers import AutoTokenizer
7
+ import torch
8
+
9
+ from eurovoc import EurovocTagger
10
+
11
+ BERT_MODEL_NAME = "EuropeanParliament/EUBERT"
12
+ MAX_LEN = 512
13
+ TEXT_MAX_LEN = MAX_LEN * 50
14
+ tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
15
+
16
+
17
+ class EndpointHandler:
18
+ mlb = MultiLabelBinarizer()
19
+
20
+ def __init__(self, path=""):
21
+ self.mlb = pickle.load(open(f"{path}/mlb.pickle", "rb"))
22
+
23
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24
+ self.model = EurovocTagger.from_pretrained(path,
25
+ bert_model_name=BERT_MODEL_NAME,
26
+ n_classes=len(self.mlb.classes_),
27
+ map_location=self.device)
28
+ self.model.eval()
29
+ self.model.freeze()
30
+
31
+ def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
32
+ """
33
+ data args:
34
+ inputs (:obj: `str` | `PIL.Image` | `np.array`)
35
+ kwargs
36
+ Return:
37
+ A :obj:`list` | `dict`: will be serialized and returned
38
+ """
39
+
40
+ text = data.pop("inputs", data)
41
+ topk = data.pop("topk", 5)
42
+ threshold = data.pop("threshold", 0.16)
43
+ debug = data.pop("debug", False)
44
+ prediction = self.get_prediction(text)
45
+ results = [{"label": label, "score": float(score)} for label, score in
46
+ zip(self.mlb.classes_, prediction[0].tolist())]
47
+ results = sorted(results, key=lambda x: x["score"], reverse=True)
48
+ results = [r for r in results if r["score"] > threshold]
49
+ results = results[:topk]
50
+ if debug:
51
+ return {"results": results, "values": prediction, "input": text}
52
+ else:
53
+ return {"results": results}
54
+
55
+ def get_prediction(self, text):
56
+ # split text into chunks of MAX_LEN and get average prediction for each chunk
57
+ chunks = [text[i:i + MAX_LEN] for i in range(0, min(len(text), TEXT_MAX_LEN), MAX_LEN)]
58
+ predictions = [self._get_prediction(chunk) for chunk in chunks]
59
+ predictions = np.array(predictions).mean(axis=0)
60
+ return predictions
61
+
62
+ def _get_prediction(self, text):
63
+ item = tokenizer.encode_plus(
64
+ text,
65
+ add_special_tokens=True,
66
+ max_length=MAX_LEN,
67
+ return_token_type_ids=False,
68
+ padding="max_length",
69
+ truncation=True,
70
+ return_attention_mask=True,
71
+ return_tensors='pt')
72
+ item.to(self.device)
73
+ _, prediction = self.model(item["input_ids"], item["attention_mask"])
74
+ prediction = prediction.cpu().detach().numpy()
75
+ print(text, prediction)
76
+ return prediction
77
+
mlb.pickle ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ef6c77d4be99dc73994099ea02207deca2449b7f4675464285fd41262146f49
3
+ size 131
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1638381e5fafe20ad1f06b8662a69d369fefdb435f6aecf61bb6ef8e5ed1780
3
+ size 134
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8d7dac5e88e6a751793812b04a026786e6f84c8ba2c20c9a1e3693ad8a5b65a
3
+ size 134
requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ nltk==3.8.1
2
+ aiohttp==3.8.5
3
+ ipython==8.14.0
4
+ pip-chill==1.0.3
5
+ pytorch-lightning==2.0.5
6
+ scikit-learn==1.3.0
7
+ transformers==4.38.2
8
+
special_tokens_map.json ADDED
@@ -0,0 +1,63 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "[UNK]",
4
+ "[PAD]",
5
+ "[CLS]",
6
+ "[SEP]",
7
+ "[MASK]",
8
+ "<s>",
9
+ "</s>",
10
+ "<unk>",
11
+ "<pad>",
12
+ "<mask>"
13
+ ],
14
+ "bos_token": {
15
+ "content": "[CLS]",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "cls_token": {
22
+ "content": "[CLS]",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "eos_token": {
29
+ "content": "[SEP]",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "mask_token": {
36
+ "content": "[MASK]",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ },
42
+ "pad_token": {
43
+ "content": "[PAD]",
44
+ "lstrip": false,
45
+ "normalized": false,
46
+ "rstrip": false,
47
+ "single_word": false
48
+ },
49
+ "sep_token": {
50
+ "content": "[SEP]",
51
+ "lstrip": false,
52
+ "normalized": false,
53
+ "rstrip": false,
54
+ "single_word": false
55
+ },
56
+ "unk_token": {
57
+ "content": "[UNK]",
58
+ "lstrip": false,
59
+ "normalized": false,
60
+ "rstrip": false,
61
+ "single_word": false
62
+ }
63
+ }
test_handler.py ADDED
@@ -0,0 +1,25 @@
1
+ from pprint import pprint
2
+ from handler import EndpointHandler
3
+
4
+ # init handler
5
+ my_handler = EndpointHandler(path=".")
6
+
7
+ # prepare sample payload
8
+
9
+ payload = {"inputs": "The Union condemns the continuing grave human rights violations by the Myanmar armed forces, including torture, sexual and gender-based violence, the persecution of civil society actors, human rights defenders and journalists, and attacks on the civilian population, including ethnic and religious minorities.",
10
+ "topk": 10,
11
+ "threshold": 0
12
+ }
13
+
14
+ #payload = {"inputs": "EN Official Journal of the European Union LI 183/19 COUNCIL IMPLEMENTING REGULATION (EU) 2023/1497 of 20 July 2023 implementing Regulation (EU) No 401/2013 concerning restrictive measures in view of the situation in Myanmar/Burma THE COUNCIL OF THE EUROPEAN UNION, Having regard to the Treaty on the Functioning of the European Union, Having regard to Council Regulation (EU) No 401/2013 of 2 May 2013 concerning restrictive measures in view of the situation in Myanmar/Burma and repealing Regulation (EC) No 194/2008 (1), and in particular Article 4i thereof, Having regard to the proposal from the High Representative of the Union for Foreign Affairs and Security Policy, Whereas: (1) On 2 May 2013, the Council adopted Regulation (EU) No 401/2013. (2) On 31 January 2023, the High Representative of the Union for Foreign Affairs and Security Policy issued a declaration on behalf of the Union strongly condemning the overthrow of Myanmar’s democratically-elected government by the Myanmar armed forces in blatant violation of the will of the people as expressed in the general election of 8 November 2020. This illegitimate act reversed the country’s democratic transition and led to disastrous humanitarian, social, security, economic and human rights consequences. (3) The Union remains deeply concerned by the continuing escalation of violence and the evolution towards a protracted conflict with regional implications. The Union condemns the continuing grave human rights violations by the Myanmar armed forces, including torture, sexual and gender-based violence, the persecution of civil society actors, human rights defenders and journalists, and attacks on the civilian population, including ethnic and religious minorities. (4) In the absence of swift progress in the situation in Myanmar/Burma, the Union has expressed several times its readiness to adopt further restrictive measures against those responsible for undermining democracy and the rule of law and for the serious human rights violations taking place in that country. (5) In view of the continuing grave situation in Myanmar/Burma, six persons and one entity should be added to the list of natural and legal persons, entities and bodies subject to restrictive measures in Annex IV to Regulation (EU) No 401/2013. (6) Regulation (EU) No 401/2013 should therefore be amended accordingly, HAS ADOPTED THIS REGULATION: Article 1 Annex IV to Regulation (EU) No 401/2013 is amended as set out in the Annex to this Regulation. Article 2 This Regulation shall enter into force on the date of its publication in the Official Journal of the European Union. This Regulation shall be binding in its entirety and directly applicable in all Member States. Done at Brussels, 20 July 2023. For the Council The President J. BORRELL FONTELLES (1)  OJ L 121, 3. 5. 2013, p. 1. ANNEX Annex IV to Regulation (EU) No 401/2013 is amended as follows: (1) the following entries are added to the list headed ‘A. Natural persons referred to in Article 4a’:   Name Identifying information Reasons Date of listing ‘94. Aung Kyaw Min Nationality: Myanmar/Burma; Date of birth: circa 1958; Place of birth: Myanmar/Burma; Gender: male; Function: Member of State Administration Council Aung Kyaw Min has been a member of the State Administration Council (SAC) since 1 February 2023. He is also the former Chief-Minister of Rakhine State. 
SAC is led by Commander in Chief Min Aung Hlaing, who took over the legislative, executive and judicial powers of the State as of 1 February 2021, preventing the democratically-elected government from fulfilling its mandate. As member of the SAC, Aung Kyaw Min has been directly involved in and responsible for decision-making concerning state functions and is therefore responsible for undermining democracy and the rule of law in Myanmar/Burma. Additionally, the SAC has adopted decisions restricting the rights of freedom of expression, including access to information, and peaceful assembly. The military forces and authorities operating under the control of the SAC have committed serious human rights violations since 1 February 2021, killing civilian and unarmed protestors, and have restricted freedom of assembly and of expression. As a member of the SAC, Aung Kyaw Min is directly responsible for those repressive decisions and for serious human rights violations. 20. 7. 2023 95. Kyaw Swar Lin a. k. a Kyaw Swar Linn Nationality: Myanmar/Burma; Place of birth: Myanmar/Burma; Gender: male; Function: Quartermaster General of the Myanmar armed forces Lieutenant General Kyaw Swar Lin was been appointed as Quartermaster General in May 2020. It is the sixth highest position in the military of Myanmar/Burma. The Office of the Quartermaster General is a department under the jurisdiction of the Ministry of Defense and is involved in arms and military equipment procurement for the Myanmar Armed Forces. In addition, Kyaw Swar Lin runs the Myanmar Economic Corporation (MEC), which is one of the two major conglomerates and holding companies operated by the military, generating revenue for the Myanmar armed forces (Tatmadaw). As Quartermaster General, he forms part of the military regime which has seized power in a military coup and overthrown the legitimately elected leaders of Myanmar/Burma. Kyaw Swar Lin is therefore a natural person whose policies and activities undermine democracy and the rule of law in Myanmar/Burma, and who provides support for actions that threaten the peace, security and stability of Myanmar/Burma. 20. 7. 2023 96. Myint Kyaing a. k. a. U Myint Kyaing Nationality: Myanmar/Burma; Date of birth: 17. 4. 1957 Place of birth: Myanmar/Burma; Gender: male; Function: Union Minister of Immigration and Population Myint Kyaing has been the Union Minister for Immigration and Population since 19 August 2021. Before that, he was Union Minister of Labour following the coup of 1 February 2021. He is a member of the State Administration Council (SAC), led by Commander-in-Chief Min Aung Hlaing, which took over the legislative, executive and judicial powers of the State in a military coup on 1 February 2021. As a government Minister, he forms part of the military regime which has seized power in a military coup and overthrown the legitimately elected leaders of Myanmar/Burma. In his capacity as Union Minister, he carries out duties in support of military regime’s repressive immigration and population policy such as restrictions for citizens to travel within the country as well as the policy of the regime towards the minority of the Rohingya in violation of human rights. As Minister for Immigration and Population he also participates in preparations for the elections announced by the military in order to legitimise the illegal coup of February 2021. 
Myint Kyaing is therefore responsible for undermining democracy and the rule of law in Myanmar/Burma and for providing support for actions that threaten the peace, security and stability of Myanmar/Burma. 20. 7. 2023 97.",
15
+ # "topk": 10,
16
+ # "threshold": 0
17
+ # }
18
+
19
+
20
+
21
+ # test the handler
22
+ payload_pred = my_handler(payload)
23
+
24
+ pprint(payload_pred)
25
+
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,110 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "[UNK]",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "[PAD]",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "[CLS]",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "[SEP]",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "[MASK]",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "65536": {
45
+ "content": "<s>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "65537": {
53
+ "content": "</s>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "65538": {
61
+ "content": "<unk>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "65539": {
69
+ "content": "<pad>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "65540": {
77
+ "content": "<mask>",
78
+ "lstrip": true,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ }
84
+ },
85
+ "additional_special_tokens": [
86
+ "[UNK]",
87
+ "[PAD]",
88
+ "[CLS]",
89
+ "[SEP]",
90
+ "[MASK]",
91
+ "<s>",
92
+ "</s>",
93
+ "<unk>",
94
+ "<pad>",
95
+ "<mask>"
96
+ ],
97
+ "bos_token": "[CLS]",
98
+ "clean_up_tokenization_spaces": true,
99
+ "cls_token": "[CLS]",
100
+ "eos_token": "[SEP]",
101
+ "errors": "replace",
102
+ "mask_token": "[MASK]",
103
+ "max_len": 512,
104
+ "model_max_length": 512,
105
+ "pad_token": "[PAD]",
106
+ "sep_token": "[SEP]",
107
+ "tokenizer_class": "RobertaTokenizer",
108
+ "trim_offsets": true,
109
+ "unk_token": "[UNK]"
110
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff