Create README.md
Browse files
README.md
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- nb
|
| 4 |
+
- nn
|
| 5 |
+
- sv
|
| 6 |
+
- da
|
| 7 |
+
- 'no'
|
| 8 |
+
license: apache-2.0
|
| 9 |
+
---
|
| 10 |
+
## Example usage
|
| 11 |
+
|
| 12 |
+
```commandline
|
| 13 |
+
git clone git@github.com:ltgoslo/slide.git
|
| 14 |
+
cd slide/src/
|
| 15 |
+
python3 fast_usage_example.py
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
## Cite us
|
| 19 |
+
@inproceedings{fedorova-etal-2025-multi,
|
| 20 |
+
title = "Multi-label {S}candinavian Language Identification ({SLIDE})",
|
| 21 |
+
author = "Fedorova, Mariia and
|
| 22 |
+
Frydenberg, Jonas Sebulon and
|
| 23 |
+
Handford, Victoria and
|
| 24 |
+
Lang{\o}, Victoria Ovedie Chruickshank and
|
| 25 |
+
Willoch, Solveig Helene and
|
| 26 |
+
Midtgaard, Marthe L{\o}ken and
|
| 27 |
+
Scherrer, Yves and
|
| 28 |
+
M{\ae}hlum, Petter and
|
| 29 |
+
Samuel, David",
|
| 30 |
+
editor = "Holdt, {\v{S}}pela Arhar and
|
| 31 |
+
Ilinykh, Nikolai and
|
| 32 |
+
Scalvini, Barbara and
|
| 33 |
+
Bruton, Micaella and
|
| 34 |
+
Debess, Iben Nyholm and
|
| 35 |
+
Tudor, Crina Madalina",
|
| 36 |
+
booktitle = "Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2025)",
|
| 37 |
+
month = mar,
|
| 38 |
+
year = "2025",
|
| 39 |
+
address = "Tallinn, Estonia",
|
| 40 |
+
publisher = "University of Tartu Library, Estonia",
|
| 41 |
+
url = "https://aclanthology.org/2025.resourceful-1.33/",
|
| 42 |
+
pages = "179--189",
|
| 43 |
+
ISBN = "978-9908-53-121-2",
|
| 44 |
+
abstract = "Identifying closely related languages at sentence level is difficult, in particular because it is often impossible to assign a sentence to a single language. In this paper, we focus on multi-label sentence-level Scandinavian language identification (LID) for Danish, Norwegian Bokm{\r{a}}l, Norwegian Nynorsk, and Swedish. We present the Scandinavian Language Identification and Evaluation, SLIDE, a manually curated multi-label evaluation dataset and a suite of LID models with varying speed{--}accuracy tradeoffs. We demonstrate that the ability to identify multiple languages simultaneously is necessary for any accurate LID method, and present a novel approach to training such multi-label LID models."
|
| 45 |
+
}
|