AngelPanizo committed on
Commit
dbe5ac2
·
verified ·
1 Parent(s): feedcca

Add BERTopic model

Browse files
README.md ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ tags:
4
+ - bertopic
5
+ library_name: bertopic
6
+ pipeline_tag: text-classification
7
+ ---
8
+
9
+ # MARTINI_enrich_BERTopic_zmoniuziniasklaida
10
+
11
+ This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
12
+ BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.
13
+
14
+ ## Usage
15
+
16
+ To use this model, please install BERTopic:
17
+
18
+ ```
19
+ pip install -U bertopic
20
+ ```
21
+
22
+ You can use the model as follows:
23
+
24
+ ```python
25
+ from bertopic import BERTopic
26
+ topic_model = BERTopic.load("AIDA-UPM/MARTINI_enrich_BERTopic_zmoniuziniasklaida")
27
+
28
+ topic_model.get_topic_info()
29
+ ```
30
+
31
+ ## Topic overview
32
+
33
+ * Number of topics: 3
34
+ * Number of training documents: 131
35
+
36
+ <details>
37
+ <summary>Click here for an overview of all topics.</summary>
38
+
39
+ | Topic ID | Topic Keywords | Topic Frequency | Label |
40
+ |----------|----------------|-----------------|-------|
41
+ | -1 | profilaktikos - vakcina - ukrainos - mobilizacijos - uzkreciamuju | 82 | -1_profilaktikos_vakcina_ukrainos_mobilizacijos |
42
+ | 0 | vakcinacija - revakcinacijos - koronaviruso - testavimas - 152 | 29 | 0_vakcinacija_revakcinacijos_koronaviruso_testavimas |
43
+ | 1 | lietuviu - statuto - demokratu - investicijas - preliminariasias | 20 | 1_lietuviu_statuto_demokratu_investicijas |
44
+
45
+ </details>
46
+
47
+ ## Training hyperparameters
48
+
49
+ * calculate_probabilities: True
50
+ * language: None
51
+ * low_memory: False
52
+ * min_topic_size: 10
53
+ * n_gram_range: (1, 1)
54
+ * nr_topics: None
55
+ * seed_topic_list: None
56
+ * top_n_words: 10
57
+ * verbose: False
58
+ * zeroshot_min_similarity: 0.7
59
+ * zeroshot_topic_list: None
60
+
61
+ ## Framework versions
62
+
63
+ * Numpy: 1.26.4
64
+ * HDBSCAN: 0.8.40
65
+ * UMAP: 0.5.7
66
+ * Pandas: 2.2.3
67
+ * Scikit-Learn: 1.5.2
68
+ * Sentence-transformers: 3.3.1
69
+ * Transformers: 4.46.3
70
+ * Numba: 0.60.0
71
+ * Plotly: 5.24.1
72
+ * Python: 3.10.12
config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "calculate_probabilities": true,
3
+ "language": null,
4
+ "low_memory": false,
5
+ "min_topic_size": 10,
6
+ "n_gram_range": [
7
+ 1,
8
+ 1
9
+ ],
10
+ "nr_topics": null,
11
+ "seed_topic_list": null,
12
+ "top_n_words": 10,
13
+ "verbose": false,
14
+ "zeroshot_min_similarity": 0.7,
15
+ "zeroshot_topic_list": null
16
+ }
ctfidf.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12e686037a4b2aefbc4d286ea801466b98fce87b4bb0a336d298bfc602b34175
3
+ size 158396
ctfidf_config.json ADDED
The diff for this file is too large to render. See raw diff
 
topic_embeddings.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fc42e1d07861ea185b9885aac73e895ecf641f29a3cef7eb6096b9ab37db8c9
3
+ size 12376
topics.json ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "topic_representations": {
3
+ "-1": [
4
+ [
5
+ "profilaktikos",
6
+ 0.4668174386024475
7
+ ],
8
+ [
9
+ "vakcina",
10
+ 0.462668776512146
11
+ ],
12
+ [
13
+ "ukrainos",
14
+ 0.44660720229148865
15
+ ],
16
+ [
17
+ "mobilizacijos",
18
+ 0.44405344128608704
19
+ ],
20
+ [
21
+ "uzkreciamuju",
22
+ 0.4278428554534912
23
+ ]
24
+ ],
25
+ "0": [
26
+ [
27
+ "vakcinacija",
28
+ 0.5417367219924927
29
+ ],
30
+ [
31
+ "revakcinacijos",
32
+ 0.49907368421554565
33
+ ],
34
+ [
35
+ "koronaviruso",
36
+ 0.48520374298095703
37
+ ],
38
+ [
39
+ "testavimas",
40
+ 0.43476900458335876
41
+ ],
42
+ [
43
+ "152",
44
+ 0.41227248311042786
45
+ ]
46
+ ],
47
+ "1": [
48
+ [
49
+ "lietuviu",
50
+ 0.5181950330734253
51
+ ],
52
+ [
53
+ "statuto",
54
+ 0.44877418875694275
55
+ ],
56
+ [
57
+ "demokratu",
58
+ 0.4155261218547821
59
+ ],
60
+ [
61
+ "investicijas",
62
+ 0.4005277156829834
63
+ ],
64
+ [
65
+ "preliminariasias",
66
+ 0.39936381578445435
67
+ ]
68
+ ]
69
+ },
70
+ "topics": [
71
+ -1,
72
+ -1,
73
+ -1,
74
+ -1,
75
+ 1,
76
+ 1,
77
+ -1,
78
+ -1,
79
+ -1,
80
+ -1,
81
+ -1,
82
+ 0,
83
+ 0,
84
+ -1,
85
+ 0,
86
+ 0,
87
+ -1,
88
+ -1,
89
+ -1,
90
+ -1,
91
+ 1,
92
+ -1,
93
+ 1,
94
+ 1,
95
+ -1,
96
+ -1,
97
+ -1,
98
+ 0,
99
+ 1,
100
+ -1,
101
+ -1,
102
+ 1,
103
+ -1,
104
+ 0,
105
+ -1,
106
+ -1,
107
+ -1,
108
+ -1,
109
+ -1,
110
+ -1,
111
+ -1,
112
+ 1,
113
+ -1,
114
+ 1,
115
+ 0,
116
+ 0,
117
+ 0,
118
+ -1,
119
+ 0,
120
+ -1,
121
+ -1,
122
+ 0,
123
+ -1,
124
+ 1,
125
+ -1,
126
+ -1,
127
+ 0,
128
+ -1,
129
+ 1,
130
+ 1,
131
+ -1,
132
+ -1,
133
+ 0,
134
+ 1,
135
+ 1,
136
+ 0,
137
+ -1,
138
+ 0,
139
+ -1,
140
+ -1,
141
+ 0,
142
+ -1,
143
+ -1,
144
+ 0,
145
+ -1,
146
+ -1,
147
+ -1,
148
+ -1,
149
+ -1,
150
+ -1,
151
+ -1,
152
+ 0,
153
+ 0,
154
+ 0,
155
+ 0,
156
+ 0,
157
+ -1,
158
+ 1,
159
+ -1,
160
+ 0,
161
+ 0,
162
+ -1,
163
+ 0,
164
+ -1,
165
+ -1,
166
+ -1,
167
+ 1,
168
+ -1,
169
+ -1,
170
+ -1,
171
+ -1,
172
+ -1,
173
+ -1,
174
+ 1,
175
+ -1,
176
+ -1,
177
+ -1,
178
+ -1,
179
+ -1,
180
+ -1,
181
+ 0,
182
+ 1,
183
+ -1,
184
+ -1,
185
+ 0,
186
+ 0,
187
+ 1,
188
+ -1,
189
+ -1,
190
+ -1,
191
+ -1,
192
+ -1,
193
+ 0,
194
+ -1,
195
+ -1,
196
+ -1,
197
+ 1,
198
+ -1,
199
+ -1,
200
+ -1,
201
+ -1
202
+ ],
203
+ "topic_sizes": {
204
+ "-1": 82,
205
+ "1": 20,
206
+ "0": 29
207
+ },
208
+ "topic_mapper": [
209
+ [
210
+ -1,
211
+ -1,
212
+ -1
213
+ ],
214
+ [
215
+ 0,
216
+ 0,
217
+ 0
218
+ ],
219
+ [
220
+ 1,
221
+ 1,
222
+ 1
223
+ ]
224
+ ],
225
+ "topic_labels": {
226
+ "-1": "-1_profilaktikos_vakcina_ukrainos_mobilizacijos",
227
+ "0": "0_vakcinacija_revakcinacijos_koronaviruso_testavimas",
228
+ "1": "1_lietuviu_statuto_demokratu_investicijas"
229
+ },
230
+ "custom_labels": null,
231
+ "_outliers": 1,
232
+ "topic_aspects": {}
233
+ }