Gleb Vinarskis
committed on
Commit
·
8473922
1
Parent(s):
b70da07
initial commit adding files
Browse files- LID-40-3-2000000-1-4.bin +3 -0
- README.md +40 -0
- __init__.py +0 -0
- config.json +21 -0
- configuration_lang.py +18 -0
- gitattributes +35 -0
- lang_detect.py +29 -0
- modeling_lang.py +50 -0
- test.py +13 -0
- test_floret.py +10 -0
LID-40-3-2000000-1-4.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:987a2e16b216eb22f0342beb75874e9748cf6bceeb4ac75f6e2efc3414e74961
|
3 |
+
size 32001553
|
README.md
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
library_name: transformers
|
3 |
+
language:
|
4 |
+
- en
|
5 |
+
- fr
|
6 |
+
- de
|
7 |
+
tags:
|
8 |
+
- v1.0.0
|
9 |
+
---
|
10 |
+
|
11 |
+
|
12 |
+
#### How to use
|
13 |
+
|
14 |
+
You can use this model with the Transformers *pipeline* for language detection.
|
15 |
+
|
16 |
+
<!-- Provide a longer summary of what this model is. -->
|
17 |
+
```python
|
18 |
+
MODEL_NAME = "emanuelaboros/lang-detect"
|
19 |
+
|
20 |
+
lang_pipeline = pipeline("lang-detect", model=MODEL_NAME,
|
21 |
+
trust_remote_code=True,
|
22 |
+
device='cpu')
|
23 |
+
|
24 |
+
sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
|
25 |
+
|
26 |
+
langs = lang_pipeline(sentence)
|
27 |
+
langs
|
28 |
+
|
29 |
+
```
|
30 |
+
|
31 |
+
```
|
32 |
+
{'label': 'fr', 'confidence': 99.87}
|
33 |
+
```
|
34 |
+
|
35 |
+
|
36 |
+
### BibTeX entry and citation info
|
37 |
+
|
38 |
+
```
|
39 |
+
% TODO: add the BibTeX citation for this model (the pipeline output shown above was pasted here by mistake)
|
40 |
+
```
|
__init__.py
ADDED
File without changes
|
config.json
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "Maslionok/language-detect-pipeline",
|
3 |
+
"architectures": [
|
4 |
+
"LangDetectorModel"
|
5 |
+
],
|
6 |
+
"filename": "LID-40-3-2000000-1-4.bin",
|
7 |
+
"attention_probs_dropout_prob": 0.1,
|
8 |
+
"auto_map": {
|
9 |
+
"AutoConfig": "configuration_lang.ImpressoConfig",
|
10 |
+
"AutoModelForTokenClassification": "modeling_lang.LangDetectorModel"
|
11 |
+
},
|
12 |
+
"custom_pipelines": {
|
13 |
+
"lang-detect": {
|
14 |
+
"impl": "lang_detect.LangDetectionPipeline",
|
15 |
+
"pt": "AutoModelForTokenClassification"
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"model_type": "lang_detect",
|
19 |
+
"torch_dtype": "float32",
|
20 |
+
"transformers_version": "4.49.0"
|
21 |
+
}
|
configuration_lang.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import PretrainedConfig
|
2 |
+
import torch
|
3 |
+
|
4 |
+
|
5 |
+
class ImpressoConfig(PretrainedConfig):
    """Configuration for the floret-based language-detection model.

    The only model-specific field is ``filename``, the path of the
    serialized floret binary; everything else is inherited from
    ``PretrainedConfig``.
    """

    model_type = "lang_detect"

    def __init__(self, filename=None, **kwargs):
        # `filename` points at the floret .bin weights (see config.json).
        super().__init__(**kwargs)
        self.filename = filename


# Expose the config class so `trust_remote_code` auto-loading can find it.
ImpressoConfig.register_for_auto_class()
|
gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
lang_detect.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import Pipeline
|
2 |
+
|
3 |
+
|
4 |
+
class LangDetectionPipeline(Pipeline):
    """Custom `lang-detect` pipeline wrapping the floret language model."""

    def _sanitize_parameters(self, **kwargs):
        # Only a `text` kwarg is routed to preprocessing; the forward and
        # postprocess stages take no extra parameters.
        preprocess_params = {}
        if "text" in kwargs:
            preprocess_params["text"] = kwargs["text"]
        return preprocess_params, {}, {}

    def preprocess(self, text, **kwargs):
        # The floret model consumes raw strings, so no preprocessing is needed.
        return text

    def _forward(self, text, **kwargs):
        # The model returns a (predictions, probabilities) pair for the text.
        return self.model(text)

    def postprocess(self, outputs, **kwargs):
        """Turn raw model output into ``{"label": ..., "confidence": ...}``."""
        labels, probs = outputs
        # floret labels look like "__label__fr"; keep only the language code.
        language = labels[0][0].replace("__label__", "")
        # Cast to plain float so the result is JSON serializable.
        score = float(probs[0][0])
        # Report confidence as a percentage rounded to two decimals.
        return {"label": language, "confidence": round(score * 100, 2)}
|
modeling_lang.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from transformers import PreTrainedModel
|
4 |
+
import logging
|
5 |
+
import floret
|
6 |
+
from .configuration_lang import ImpressoConfig
|
7 |
+
|
8 |
+
logger = logging.getLogger(__name__)
|
9 |
+
|
10 |
+
|
11 |
+
class LangDetectorModel(PreTrainedModel):
    """Thin ``PreTrainedModel`` wrapper around a floret language-ID model.

    All inference is delegated to floret; the single torch parameter exists
    only so the model has a device and plays nicely with HF machinery.
    """

    config_class = ImpressoConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # Dummy parameter so `.device` / `.to()` work on an otherwise
        # parameter-free model.
        self.dummy_param = nn.Parameter(torch.zeros(1))
        # Load the floret binary referenced by the config.
        # BUG FIX: ImpressoConfig exposes `filename` directly;
        # `self.config.config.filename` would raise AttributeError.
        self.model_floret = floret.load_model(self.config.filename)

    def forward(self, input_ids, **kwargs):
        """Predict the top-1 language for a string or a list of strings.

        Args:
            input_ids: a single text or a list of texts (despite the name,
                these are raw strings, not token ids).

        Returns:
            The ``(predictions, probabilities)`` pair produced by floret's
            ``predict(..., k=1)``.

        Raises:
            ValueError: if the input is neither a string nor a list of strings.
        """
        if isinstance(input_ids, str):
            # floret expects a batch, so wrap a single string in a list.
            texts = [input_ids]
        elif isinstance(input_ids, list) and all(isinstance(t, str) for t in input_ids):
            texts = input_ids
        else:
            raise ValueError(f"Unexpected input type: {type(input_ids)}")

        predictions, probabilities = self.model_floret.predict(texts, k=1)
        return (
            predictions,
            probabilities,
        )

    @property
    def device(self):
        # Derived from the dummy parameter, the model's only torch tensor.
        return next(self.parameters()).device

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """Build the model from kwargs alone; no torch weights are loaded.

        NOTE(review): positional `args` (e.g. a model id) are ignored and all
        kwargs are forwarded to ImpressoConfig — confirm this matches the
        intended remote-code loading flow.
        """
        config = ImpressoConfig(**kwargs)
        return cls(config)
|
test.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import pipeline

MODEL_NAME = "emanuelaboros/lang-detect"

# Load the custom `lang-detect` pipeline registered by this model repo.
lang_pipeline = pipeline(
    "lang-detect", model=MODEL_NAME, trust_remote_code=True, device="cpu"
)

sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."

langs = lang_pipeline(sentence)
# BUG FIX: a bare `langs` expression is a no-op in a script (it only echoes in
# a REPL/notebook); print the prediction so running the file shows the result.
print(langs)
|
test_floret.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import floret

# Load the raw floret language-ID model directly (no transformers wrapper).
model_floret = floret.load_model("LID-40-3-2000000-1-4.bin")
print(type(model_floret))

# BUG FIX: removed a dead `input_ids = 'this is a text'` assignment that was
# immediately overwritten by the sentence below.
input_ids = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."

# Top-1 language prediction for the sample sentence.
print(model_floret.predict([input_ids], k=1))
|