Create clustering.py
clustering.py (ADDED, +926 -0)
@@ -0,0 +1,926 @@
import os
import re
import unicodedata
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import umap
from dotenv import load_dotenv
from googleapiclient.discovery import build
from plotly.subplots import make_subplots
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
from sklearn import set_config
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import (
    calinski_harabasz_score,
    pairwise_distances,
    silhouette_score,
)
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from transformers import pipeline
from wordcloud import WordCloud

if os.getenv("RAILWAY_ENVIRONMENT") is None:
    load_dotenv()

api_key = os.getenv("youtube_api_key")

RANDOM_STATE = 333

stopwords_es = [
    "a", "al", "algo", "algún", "alguna", "algunas", "alguno", "algunos",
    "ante", "antes", "bajo", "bastante", "bien", "cada", "casi", "como",
    "con", "cuanto", "de", "del", "desde", "donde", "durante", "el",
    "ella", "ellos", "en", "encima", "ese", "eso", "esta", "estas",
    "este", "estos", "fuera", "hay", "la", "las", "le", "lo", "los",
    "más", "me", "mi", "mí", "menos", "mismo", "mucho", "muy", "nada",
    "ni", "no", "nos", "nuestro", "nuestra", "o", "os", "para", "pero",
    "poco", "por", "que", "quien", "si", "sólo", "sobre", "su", "sus",
    "te", "tu", "tus", "un", "una", "unas", "uno", "unos", "vos", "ya",
    "yo", "además", "alrededor", "aún", "bajo", "bien", "cada", "cierta",
    "ciertas", "como", "con", "de", "debe", "dentro", "dos", "ella", "en",
    "entonces", "entre", "esa", "esos", "está", "hasta", "incluso",
    "lejos", "lo", "luego", "medio", "mientras", "muy", "nunca", "o",
    "otro", "para", "pero", "poco", "por", "se", "si", "sin", "sobre",
    "tan", "te", "ten", "tendría", "todos", "total", "un", "una", "uno",
    "ustedes", "yo", "y", "es", "son", "solo", "les",
]


def normalize_text(text):
    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII")
    text = text.lower()
    return text


def remove_stopwords(text, stopwords):
    # Split the text into words and drop the stopwords
    return [word for word in text.split() if word not in stopwords]


def plot_wordcloud(data, text_column, output_filename=None):
    text = " ".join(data[text_column])

    stopwords_set = set(stopwords_es)

    normalized_text = normalize_text(text)
    cleaned_text = remove_stopwords(normalized_text, stopwords_set)
    filtered_text = replace_html_entities(" ".join(cleaned_text))

    # Build the word cloud from the cleaned text
    wordcloud = WordCloud(
        width=800, height=400, background_color="white", normalize_plurals=True
    ).generate(filtered_text)

    # Render the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")

    if output_filename:
        plt.savefig(output_filename, format="png")
        plt.close()
        return output_filename
+
def extract_video_id(url):
|
212 |
+
"""
|
213 |
+
Extrae el video_id de una URL de YouTube.
|
214 |
+
|
215 |
+
Parámetros:
|
216 |
+
- url: str, la URL del video de YouTube.
|
217 |
+
|
218 |
+
Retorna:
|
219 |
+
- video_id: str, el identificador del video de YouTube.
|
220 |
+
"""
|
221 |
+
# Expresión regular para encontrar el video_id en una URL de YouTube
|
222 |
+
pattern = r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})"
|
223 |
+
match = re.search(pattern, url)
|
224 |
+
|
225 |
+
if match:
|
226 |
+
return match.group(1)
|
227 |
+
else:
|
228 |
+
raise ValueError("No se pudo encontrar un ID de video en la URL proporcionada.")
|
229 |
+
|
230 |
+
|
231 |
+
def get_youtube_video_details(url, api_key):
|
232 |
+
"""
|
233 |
+
Obtiene detalles de un video de YouTube usando la API de YouTube Data v3.
|
234 |
+
|
235 |
+
:param video_id: ID del video de YouTube.
|
236 |
+
:param api_key: Clave de API de YouTube Data v3.
|
237 |
+
:return: Un diccionario con el nombre del video, el canal, el número de vistas y el número de comentarios.
|
238 |
+
"""
|
239 |
+
try:
|
240 |
+
youtube = build("youtube", "v3", developerKey=api_key)
|
241 |
+
|
242 |
+
video_id = extract_video_id(url)
|
243 |
+
|
244 |
+
request = youtube.videos().list(part="snippet,statistics", id=video_id)
|
245 |
+
response = request.execute()
|
246 |
+
|
247 |
+
if "items" in response and len(response["items"]) > 0:
|
248 |
+
video = response["items"][0]
|
249 |
+
details = {
|
250 |
+
"title": video["snippet"]["title"],
|
251 |
+
"channel_title": video["snippet"]["channelTitle"],
|
252 |
+
"view_count": video["statistics"].get("viewCount", "No disponible"),
|
253 |
+
"comment_count": video["statistics"].get(
|
254 |
+
"commentCount", "No disponible"
|
255 |
+
),
|
256 |
+
}
|
257 |
+
return details
|
258 |
+
else:
|
259 |
+
return {"error": "No se encontró el video con el ID proporcionado."}
|
260 |
+
except Exception as e:
|
261 |
+
return {"error": str(e)}
|
262 |
+
|
263 |
+
|


def get_youtube_comments(api_key, url, max_results=100):
    """
    Fetches the comments of a YouTube video and returns them as a pandas DataFrame.

    Parameters:
    - api_key: str, the YouTube API key.
    - url: str, the YouTube video URL.
    - max_results: int, maximum number of comments per request (default 100).

    Returns:
    - df: pandas DataFrame containing the video comments.
    """

    # Create the YouTube API client
    youtube = build("youtube", "v3", developerKey=api_key)

    # Request the first page of comments for the video
    video_id = extract_video_id(url)
    request = youtube.commentThreads().list(
        part="snippet", videoId=video_id, maxResults=max_results
    )

    response = request.execute()

    # List that accumulates the comment records
    comments_data = []

    # Process and store the comments from the first page
    for item in response["items"]:
        comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
        author = item["snippet"]["topLevelComment"]["snippet"]["authorDisplayName"]
        published_at = item["snippet"]["topLevelComment"]["snippet"]["publishedAt"]

        comments_data.append(
            {"author": author, "comment": comment, "published_at": published_at}
        )

    # Paginate and fetch the remaining comments while more pages are available
    next_page_token = response.get("nextPageToken")

    while next_page_token:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            pageToken=next_page_token,
            maxResults=max_results,
        )
        response = request.execute()

        for item in response["items"]:
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            author = item["snippet"]["topLevelComment"]["snippet"]["authorDisplayName"]
            published_at = item["snippet"]["topLevelComment"]["snippet"]["publishedAt"]

            comments_data.append(
                {"author": author, "comment": comment, "published_at": published_at}
            )

        next_page_token = response.get("nextPageToken")

    # Convert the list of comments into a pandas DataFrame
    df = pd.DataFrame(comments_data)

    return df


def add_normalized_embeddings_to_dataframe(
    data, text_column, model_name="paraphrase-multilingual-MiniLM-L12-v2"
):
    """
    Generates and L2-normalizes embeddings for a text column in a DataFrame and
    stores them in a new "embeddings" column.

    Parameters:
    - data: pandas DataFrame containing the text column.
    - text_column: str, name of the column with the text to embed.
    - model_name: str, SentenceTransformer model to use
      (default "paraphrase-multilingual-MiniLM-L12-v2").

    Returns:
    - data: pandas DataFrame with the new column of normalized embeddings.
    """

    model = SentenceTransformer(model_name)
    sentences = data[text_column].tolist()
    embeddings = model.encode(sentences)
    normalized_embeddings = normalize(embeddings, norm="l2")

    data["embeddings"] = [embedding for embedding in normalized_embeddings]

    return data


def plot_k_distance(data, threshold=0.01, quantile=0.95):
    # embeddings_matrix = np.array(data["embeddings"].tolist())
    embeddings_matrix = data.copy()

    for threshold in [threshold, 0.05, 0.1, 0.2]:
        min_samples = int(round(data.shape[0] * threshold, 0))
        n_neighbors = min_samples - 1

        if n_neighbors > 2:
            nn = NearestNeighbors(
                n_neighbors=n_neighbors, algorithm="auto", metric="cosine", n_jobs=-1
            )
            nn.fit(embeddings_matrix)
            distances, _ = nn.kneighbors(embeddings_matrix)
            k_distances = distances[:, -1]
            min_eps = np.percentile(k_distances, quantile * 100)
            k_distances = np.sort(k_distances)
            fig = go.Figure()
            fig.add_trace(go.Scatter(y=k_distances, mode="lines", name="k-distances"))
            fig.add_hline(
                y=min_eps,
                line=dict(color="red", dash="dash"),
                name=f"min_eps = {min_eps:.2f}",
            )
            fig.update_layout(
                title="k-Distance Graph",
                xaxis_title="Index",
                yaxis_title="Distance",
                width=800,
                height=600,
                template="plotly_dark",
            )
            return fig, min_eps
    return None, None


def find_most_similar_comment(cluster_data, avg_embedding):
    similarities = [
        1 - cosine(avg_embedding, emb) for emb in cluster_data["embeddings"]
    ]
    most_similar_index = np.argmax(similarities)

    return cluster_data.iloc[most_similar_index]["comment"]


def format_text(text, line_length=50):
    """
    Formats the text by inserting line breaks every 'line_length' characters.

    :param text: The text to format.
    :param line_length: Maximum length of each line (default 50 characters).
    :return: The formatted text with line breaks.
    """
    # Split the text into chunks of 'line_length' characters joined by <br>
    formatted_text = "<br>".join(
        text[i : i + line_length] for i in range(0, len(text), line_length)
    )
    return formatted_text


def replace_html_entities(text):
    """
    Replaces known HTML entities in the text with their corresponding characters.

    :param text: The text containing HTML entities.
    :return: The text with the entities replaced.
    """
    replacements = {
        "&quot;": '"',
        "&amp;": "&",
        "&lt;": "<",
        "&gt;": ">",
        "<br>": "\n",  # Replace <br> with a newline
    }

    for entity, char in replacements.items():
        text = text.replace(entity, char)

    return text


def plot_sentiment_global(
    data,
    sentimiento_col="sentimiento",
    title="Evolución de Comentarios por Sentimiento",
    width=1200,
    height=600,
):
    """Plots the global distribution of comments per sentiment as a bar chart."""
    df_global = data[sentimiento_col].value_counts().reset_index()
    df_global.columns = [sentimiento_col, "count"]

    fig_global = go.Figure()

    color_palette = {"positivo": "#138d75", "negativo": "#a93226", "neutro": "#909497"}

    for sentimiento in df_global[sentimiento_col].unique():
        df_sentimiento = df_global[df_global[sentimiento_col] == sentimiento]
        fig_global.add_trace(
            go.Bar(
                x=df_sentimiento[sentimiento_col],
                y=df_sentimiento["count"],
                text=df_sentimiento["count"],
                textposition="inside",
                insidetextanchor="middle",
                name=sentimiento,
                marker=dict(color=color_palette[sentimiento]),
            )
        )

    fig_global.update_layout(
        title=f"{title} - Global",
        xaxis_title="Sentimiento",
        yaxis_title="Número Total de Comentarios",
        legend_title="Sentimiento",
        template="plotly_dark",
        width=width,
        height=height,
    )

    return fig_global


def plot_sentiment_daily(
    data,
    fecha_col="published_at",
    sentimiento_col="sentimiento",
    title="Evolución de Comentarios por Sentimiento",
    width=1200,
    height=600,
):
    """Plots the daily evolution of comments per sentiment as a stacked bar chart."""
    data[fecha_col] = pd.to_datetime(data[fecha_col])

    df_grouped = (
        data.groupby([pd.Grouper(key=fecha_col, freq="D"), sentimiento_col])
        .size()
        .reset_index(name="count")
    )

    df_grouped["total_daily"] = df_grouped.groupby(pd.Grouper(key=fecha_col, freq="D"))[
        "count"
    ].transform("sum")
    df_grouped["percentage"] = df_grouped["count"] / df_grouped["total_daily"] * 100

    fig_daily = go.Figure()

    color_palette = {"positivo": "#138d75", "negativo": "#a93226", "neutro": "#909497"}

    for sentimiento in data[sentimiento_col].unique():
        df_sentimiento = df_grouped[df_grouped[sentimiento_col] == sentimiento]
        fig_daily.add_trace(
            go.Bar(
                x=df_sentimiento[fecha_col],
                y=df_sentimiento["total_daily"],
                name=sentimiento,
                text=df_sentimiento["count"],
                texttemplate="%{text}",
                textposition="inside",
                insidetextanchor="middle",
                customdata=df_sentimiento["percentage"],
                hovertemplate="<b>Fecha</b>: %{x}<br><b>Sentimiento</b>: %{name}<br><b>Porcentaje</b>: %{customdata:.1f}%<br><b>Total de Comentarios</b>: %{text}<extra></extra>",  # Hover text with percentage and daily total
                marker=dict(color=color_palette[sentimiento]),
            )
        )

    fig_daily.update_layout(
        title=f"{title} - Por Día",
        xaxis_title="Fecha",
        yaxis_title="Total de Comentarios",
        legend_title="Sentimiento",
        barmode="stack",
        template="plotly_dark",
        width=width,
        height=height,
    )

    return fig_daily


def create_3d_umap_plot(data):

    def calculate_sentiment_info(data):
        cluster_sentiments = (
            data.groupby("Cluster")["sentimiento"].value_counts().unstack(fill_value=0)
        )
        total_by_cluster = cluster_sentiments.sum(axis=1)
        sentiment_percentages = (
            cluster_sentiments.div(total_by_cluster, axis=0) * 100
        ).round(2)

        sentiment_info = {}
        for cluster in total_by_cluster.index:
            info = [
                f"{sentiment}: {count} ({percent}%)"
                for sentiment, count, percent in zip(
                    cluster_sentiments.columns,
                    cluster_sentiments.loc[cluster],
                    sentiment_percentages.loc[cluster],
                )
            ]
            sentiment_info[cluster] = (
                f"Total {total_by_cluster[cluster]}<br>" + "<br>".join(info)
            )

        return sentiment_info

    fig = go.Figure()

    fig.add_trace(
        go.Scatter3d(
            x=data["UMAP1"],
            y=data["UMAP2"],
            z=data["UMAP3"],
            mode="markers",
            marker=dict(
                size=3,
                color=data["Cluster"],
                colorscale="Viridis",
                colorbar=dict(title="Cluster"),
            ),
            text=data["sentimiento"],
            name="Puntos",
        )
    )

    fig.update_layout(
        scene=dict(xaxis_title="UMAP 1", yaxis_title="UMAP 2", zaxis_title="UMAP 3"),
        template="plotly_dark",
        title="Visualización 3D con UMAP y Clustering",
    )

    sentiment_info = calculate_sentiment_info(data)

    hovertemplate = (
        "Cluster: %{marker.color}<br>"
        + data["Cluster"].map(sentiment_info)
        + "<br>"
        + "<extra></extra>"
    )

    fig.update_traces(hovertemplate=hovertemplate)

    fig.show()


def perform_clustering(data, min_eps, max_eps=0.95, n=5, embeddings_col="embeddings"):

    embeddings_matrix = np.array(data[embeddings_col].tolist())
    # threshold_values = np.round(np.linspace(min_eps, max_eps, n), 2)
    threshold_values = np.linspace(min_eps, max_eps, n)

    cluster_assignments = {}
    cluster_counts = {}
    calinski_harabasz_scores = {}
    silhouette_scores = {}
    most_similar_comments = {}

    for distance_threshold in threshold_values:
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            linkage="complete",
            metric="cosine",
        )
        data[f"cluster_{distance_threshold}"] = clustering.fit_predict(
            embeddings_matrix
        )
        cluster_assignments[distance_threshold] = data[f"cluster_{distance_threshold}"]
        cluster_counts[distance_threshold] = data[
            f"cluster_{distance_threshold}"
        ].value_counts()
        labels = data[f"cluster_{distance_threshold}"]

        # Compute the Calinski-Harabasz score
        if len(np.unique(labels)) > 1:
            # Recompute a pairwise distance matrix for the clustered points
            euclidean_distances = pairwise_distances(
                embeddings_matrix, metric="euclidean"
            )
            ch_score = calinski_harabasz_score(euclidean_distances, labels)
        else:
            ch_score = -1  # Default value when there is only one cluster
        calinski_harabasz_scores[distance_threshold] = ch_score

        # Compute the Silhouette score
        if len(np.unique(labels)) > 1:
            sil_score = silhouette_score(embeddings_matrix, labels, metric="cosine")
        else:
            sil_score = -1  # Default value when there is only one cluster
        silhouette_scores[distance_threshold] = sil_score

        # Representative comment per cluster: the one closest to the average embedding
        most_similar_comments[distance_threshold] = {}
        for cluster_id in np.unique(labels):
            cluster_data = data[data[f"cluster_{distance_threshold}"] == cluster_id]
            avg_embedding = np.mean(cluster_data[embeddings_col].tolist(), axis=0)
            most_similar_comment = find_most_similar_comment(
                cluster_data, avg_embedding
            )
            most_similar_comments[distance_threshold][cluster_id] = most_similar_comment

    return (
        cluster_assignments,
        cluster_counts,
        calinski_harabasz_scores,
        silhouette_scores,
        most_similar_comments,
        data,
    )


def build_sankey_data(
    cluster_assignments,
    cluster_counts,
    most_similar_comments,
    min_items_by_cluster=10,
):
    labels = []
    source = []
    target = []
    values = []
    comments = []

    threshold_values = sorted(cluster_assignments.keys())
    valid_clusters = {}

    # Keep only the clusters with at least min_items_by_cluster items
    for threshold in threshold_values:
        valid_clusters[threshold] = [
            j
            for j in np.unique(cluster_assignments[threshold])
            if cluster_counts[threshold].get(j, 0) >= min_items_by_cluster
        ]

    for i, threshold in enumerate(threshold_values):
        for j in valid_clusters[threshold]:
            cluster_name = (
                f"{j} (d={threshold})\nTotal: {cluster_counts[threshold].get(j, 0)}"
            )
            if cluster_name not in labels:
                labels.append(cluster_name)
                comments.append(
                    format_text(
                        replace_html_entities(
                            most_similar_comments[threshold].get(j, "N/A")
                        )
                    )
                )

        if i > 0:
            prev_threshold = threshold_values[i - 1]
            for prev_cluster in valid_clusters[prev_threshold]:
                for curr_cluster in valid_clusters[threshold]:
                    count = np.sum(
                        (cluster_assignments[prev_threshold] == prev_cluster)
                        & (cluster_assignments[threshold] == curr_cluster)
                    )
                    if count > 0:
                        source_idx = labels.index(
                            f"{prev_cluster} (d={prev_threshold})\nTotal: {cluster_counts[prev_threshold].get(prev_cluster, 0)}"
                        )
                        target_idx = labels.index(
                            f"{curr_cluster} (d={threshold})\nTotal: {cluster_counts[threshold].get(curr_cluster, 0)}"
                        )
                        source.append(source_idx)
                        target.append(target_idx)
                        values.append(count)

    return (labels, source, target, values, comments)


def plot_sankey(labels, source, target, values, comments, width=None, height=None):
    fig = go.Figure(
        go.Sankey(
            node=dict(
                pad=15,
                thickness=20,
                line=dict(color="black", width=0),
                label=labels,
                hovertemplate="<b>%{label}</b><br>"
                + "<br><b>Comentario:</b><br>%{customdata}<extra></extra>",
                customdata=comments,
            ),
            link=dict(
                source=source,
                target=target,
                value=values,
                hovertemplate="<extra></extra>",
            ),
        )
    )
    fig.update_layout(
        title_text="Sankey Diagram of Agglomerative Clustering Transitions",
        font_size=14,
        width=width,
        height=height,
        template="plotly_dark",
    )

    return fig


def plot_clustering_metric(silhouette_scores, calinski_scores):
    """
    Plots the silhouette and Calinski-Harabasz scores against the distance thresholds,
    using two separate Y axes, and marks the threshold with the best silhouette score.

    Args:
        silhouette_scores (dict): Maps distance thresholds to silhouette scores.
        calinski_scores (dict): Maps distance thresholds to Calinski-Harabasz scores.

    Returns:
        (fig, best_threshold): the Plotly figure and the threshold with the best
        silhouette score.
    """
    # Collect the distance thresholds and their scores
    silhouette_thresholds = sorted(silhouette_scores.keys())
    silhouette_metric_scores = [silhouette_scores[t] for t in silhouette_thresholds]

    calinski_thresholds = sorted(calinski_scores.keys())
    calinski_metric_scores = [calinski_scores[t] for t in calinski_thresholds]

    # The best threshold is the one with the highest silhouette score
    best_threshold = max(silhouette_scores, key=silhouette_scores.get)

    # Build the figure with two Y axes
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Trace for the silhouette score
    fig.add_trace(
        go.Scatter(
            x=silhouette_thresholds,
            y=silhouette_metric_scores,
            mode="lines+markers",
            name="Silhouette Score",
            marker=dict(color="red", size=10),
            line=dict(color="red", width=2),
            text=[
                f"Threshold: {t}<br>Silhouette Score: {s}"
                for t, s in zip(silhouette_thresholds, silhouette_metric_scores)
            ],
            hoverinfo="text",
        ),
        secondary_y=False,  # Left Y axis
    )

    # Trace for the Calinski-Harabasz score
    fig.add_trace(
        go.Scatter(
            x=calinski_thresholds,
            y=calinski_metric_scores,
            mode="lines+markers",
            name="Calinski-Harabasz Score",
            marker=dict(color="blue", size=10),
            line=dict(color="blue", width=2),
            text=[
                f"Threshold: {t}<br>Calinski-Harabasz Score: {s}"
                for t, s in zip(calinski_thresholds, calinski_metric_scores)
            ],
            hoverinfo="text",
        ),
        secondary_y=True,  # Right Y axis
    )

    # Vertical line at the best threshold
    fig.add_vline(
        x=best_threshold,
        line=dict(color="green", width=2, dash="dash"),
        annotation_text=f"Best Threshold: {best_threshold}",
        annotation_position="top right",
    )

    # Chart layout
    fig.update_layout(
        title="Clustering Metrics vs. Threshold Distance",
        xaxis_title="Threshold Distance",
        yaxis_title="Silhouette Score",
        yaxis2_title="Calinski-Harabasz Score",
        font=dict(size=12),
        width=800,
        height=600,
        template="plotly_dark",
    )

    return fig, best_threshold


classifier = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    truncation=True,
)


def map_sentiment(estrella):
    if estrella in ["1 star", "2 stars"]:
        return "negativo"
    elif estrella == "3 stars":
        return "neutro"
    elif estrella in ["4 stars", "5 stars"]:
        return "positivo"


def classify_sentiment(texto):
    resultado = classifier(texto)[0]
    sentimiento = map_sentiment(resultado["label"])
    return (
        sentimiento,
        resultado["score"],
    )


def classify_sentiment_df(data, comment_col="comment"):

    def classify_sentiment(texto):
        resultado = classifier(texto)[0]
        sentimiento = map_sentiment(resultado["label"])
        return sentimiento, resultado["score"]

    data["sentimiento"], data["confianza"] = zip(
        *data[comment_col].apply(classify_sentiment)
    )

    return data


def transform_embeddings(
    data, embeddings_col="embeddings", n_components=3, random_seed=42
):
    # Convert the embeddings column into a numpy matrix
    embeddings_matrix = np.array(data[embeddings_col].tolist())

    # Apply UMAP for dimensionality reduction
    umap_model = umap.UMAP(
        n_components=n_components, random_state=random_seed, metric="cosine"
    )
    data_umap = umap_model.fit_transform(embeddings_matrix)

    # Compute pairwise distances and use percentiles to derive min_eps and max_eps
    distances = pairwise_distances(data_umap, metric="cosine")
    min_eps = np.percentile(distances, 10)
    max_eps = np.percentile(distances, 50)

    umap_data = pd.DataFrame(
        {"embeddings": [embedding.tolist() for embedding in data_umap]}
    )
    umap_data["comment"] = data["comment"]

    return umap_data, min_eps, max_eps


def determine_min_items_by_cluster(total):
    """Returns the minimum cluster size to display, based on the total number of comments."""
    if total < 50:
        min_items_by_cluster = 1
    elif total < 100:
        min_items_by_cluster = 5
    elif total < 500:
        min_items_by_cluster = 10
    else:
        min_items_by_cluster = int(round(total * 0.01, 2))

    return min_items_by_cluster


def main(): ...


if __name__ == "__main__":
    main()
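
main() is left as a stub in this commit. As a rough illustration only, a minimal sketch of how the functions above could be chained; run_pipeline is a hypothetical helper, not part of the file, and it assumes a video URL plus the youtube_api_key credential loaded at the top of the module:

def run_pipeline(url, api_key):
    # Fetch comments and tag each one with a sentiment label
    comments = get_youtube_comments(api_key, url)
    comments = classify_sentiment_df(comments, comment_col="comment")

    # Embed the comments, then reduce the embeddings with UMAP
    comments = add_normalized_embeddings_to_dataframe(comments, "comment")
    umap_data, min_eps, max_eps = transform_embeddings(comments)

    # Cluster at several distance thresholds and build the Sankey view
    assignments, counts, ch_scores, sil_scores, top_comments, umap_data = (
        perform_clustering(umap_data, min_eps, max_eps)
    )
    min_items = determine_min_items_by_cluster(len(umap_data))
    labels, source, target, values, node_comments = build_sankey_data(
        assignments, counts, top_comments, min_items_by_cluster=min_items
    )
    return plot_sankey(labels, source, target, values, node_comments, 1000, 800)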