Kuautli committed on
Commit ec5679e · verified · 1 Parent(s): b1cb914

Create clustering.py

Files changed (1): clustering.py +926 -0
clustering.py ADDED
@@ -0,0 +1,926 @@
import os
import re
import unicodedata
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import umap
from dotenv import load_dotenv
from googleapiclient.discovery import build
from plotly.subplots import make_subplots
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
from sklearn import set_config
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import (
    calinski_harabasz_score,
    pairwise_distances,
    silhouette_score,
)
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from transformers import pipeline
from wordcloud import WordCloud

if os.getenv("RAILWAY_ENVIRONMENT") is None:
    load_dotenv()

api_key = os.getenv("youtube_api_key")

RANDOM_STATE = 333

stopwords_es = [
    "a",
    "al",
    "algo",
    "algún",
    "alguna",
    "algunas",
    "alguno",
    "algunos",
    "ante",
    "antes",
    "bajo",
    "bastante",
    "bien",
    "cada",
    "casi",
    "como",
    "con",
    "cuanto",
    "de",
    "del",
    "desde",
    "donde",
    "durante",
    "el",
    "ella",
    "ellos",
    "en",
    "encima",
    "ese",
    "eso",
    "esta",
    "estas",
    "este",
    "estos",
    "fuera",
    "hay",
    "la",
    "las",
    "le",
    "lo",
    "los",
    "más",
    "me",
    "mi",
    "mí",
    "menos",
    "mismo",
    "mucho",
    "muy",
    "nada",
    "ni",
    "no",
    "nos",
    "nuestro",
    "nuestra",
    "o",
    "os",
    "para",
    "pero",
    "poco",
    "por",
    "que",
    "quien",
    "si",
    "sólo",
    "sobre",
    "su",
    "sus",
    "te",
    "tu",
    "tus",
    "un",
    "una",
    "unas",
    "uno",
    "unos",
    "vos",
    "ya",
    "yo",
    "además",
    "alrededor",
    "aún",
    "cierta",
    "ciertas",
    "debe",
    "dentro",
    "dos",
    "entonces",
    "entre",
    "esa",
    "esos",
    "está",
    "hasta",
    "incluso",
    "lejos",
    "luego",
    "medio",
    "mientras",
    "nunca",
    "otro",
    "se",
    "sin",
    "tan",
    "ten",
    "tendría",
    "todos",
    "total",
    "ustedes",
    "y",
    "es",
    "son",
    "solo",
    "les",
]


def normalize_text(text):
    """Strip accents via NFKD ASCII folding and lowercase the text."""
    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII")
    text = text.lower()
    return text


def remove_stopwords(text, stopwords):
    # Split the text into words and drop any stopword
    return [word for word in text.split() if word not in stopwords]

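
# Illustrative example (assumed input, not from the source): accents are
# folded before stopword removal, so "Más" comes out as "mas".
# remove_stopwords(normalize_text("Más vale tarde que nunca"), {"que"})
# -> ["mas", "vale", "tarde", "nunca"]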

def plot_wordcloud(data, text_column, output_filename=None):
    text = " ".join(data[text_column])

    # Normalize the stopwords the same way as the text; otherwise accented
    # entries such as "más" would never match the ASCII-folded words.
    stopwords_set = {normalize_text(word) for word in stopwords_es}

    normalized_text = normalize_text(text)
    cleaned_text = remove_stopwords(normalized_text, stopwords_set)
    filtered_text = replace_html_entities(" ".join(cleaned_text))

    # Build the word cloud from the filtered text
    wordcloud = WordCloud(
        width=800, height=400, background_color="white", normalize_plurals=True
    ).generate(filtered_text)

    # Render the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")

    if output_filename:
        plt.savefig(output_filename, format="png")
        plt.close()
        return output_filename


def extract_video_id(url):
    """
    Extract the video_id from a YouTube URL.

    Parameters:
    - url: str, the YouTube video URL.

    Returns:
    - video_id: str, the YouTube video identifier.
    """
    # Regular expression matching the video_id in common YouTube URL forms
    pattern = r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})"
    match = re.search(pattern, url)

    if match:
        return match.group(1)
    else:
        raise ValueError("Could not find a video ID in the given URL.")


def get_youtube_video_details(url, api_key):
    """
    Fetch details for a YouTube video via the YouTube Data API v3.

    :param url: URL of the YouTube video.
    :param api_key: YouTube Data API v3 key.
    :return: A dict with the video title, channel, view count, and comment count.
    """
    try:
        youtube = build("youtube", "v3", developerKey=api_key)

        video_id = extract_video_id(url)

        request = youtube.videos().list(part="snippet,statistics", id=video_id)
        response = request.execute()

        if "items" in response and len(response["items"]) > 0:
            video = response["items"][0]
            details = {
                "title": video["snippet"]["title"],
                "channel_title": video["snippet"]["channelTitle"],
                "view_count": video["statistics"].get("viewCount", "Not available"),
                "comment_count": video["statistics"].get(
                    "commentCount", "Not available"
                ),
            }
            return details
        else:
            return {"error": "No video found for the given ID."}
    except Exception as e:
        return {"error": str(e)}


def get_youtube_comments(api_key, url, max_results=100):
    """
    Fetch the comments of a YouTube video into a pandas DataFrame.

    Parameters:
    - api_key: str, the YouTube API key.
    - url: str, the YouTube video URL.
    - max_results: int, maximum number of comments per request (default 100).

    Returns:
    - df: pandas DataFrame with the video's comments.
    """
    # Build the YouTube API client
    youtube = build("youtube", "v3", developerKey=api_key)

    video_id = extract_video_id(url)

    # List to accumulate the comment records
    comments_data = []
    next_page_token = None

    # Request comment threads, following nextPageToken until exhausted
    while True:
        params = dict(part="snippet", videoId=video_id, maxResults=max_results)
        if next_page_token:
            params["pageToken"] = next_page_token
        response = youtube.commentThreads().list(**params).execute()

        for item in response["items"]:
            snippet = item["snippet"]["topLevelComment"]["snippet"]
            comments_data.append(
                {
                    "author": snippet["authorDisplayName"],
                    "comment": snippet["textDisplay"],
                    "published_at": snippet["publishedAt"],
                }
            )

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    # Convert the list of comments into a pandas DataFrame
    df = pd.DataFrame(comments_data)

    return df


def add_normalized_embeddings_to_dataframe(
    data, text_column, model_name="paraphrase-multilingual-MiniLM-L12-v2"
):
    """
    Generate L2-normalized sentence embeddings for a text column and store them
    in a new "embeddings" column of the DataFrame.

    Parameters:
    - data: pandas DataFrame containing the text column.
    - text_column: str, name of the column whose text is embedded.
    - model_name: str, SentenceTransformer model to use
      (default "paraphrase-multilingual-MiniLM-L12-v2").

    Returns:
    - data: pandas DataFrame with the new column of normalized embeddings.
    """
    model = SentenceTransformer(model_name)
    sentences = data[text_column].tolist()
    embeddings = model.encode(sentences)
    normalized_embeddings = normalize(embeddings, norm="l2")

    data["embeddings"] = list(normalized_embeddings)

    return data


def plot_k_distance(data, threshold=0.01, quantile=0.95):
    embeddings_matrix = data.copy()

    # Try growing sample fractions until there are enough neighbors to plot
    for fraction in [threshold, 0.05, 0.1, 0.2]:
        min_samples = int(round(data.shape[0] * fraction, 0))
        n_neighbors = min_samples - 1

        if n_neighbors > 2:
            nn = NearestNeighbors(
                n_neighbors=n_neighbors, algorithm="auto", metric="cosine", n_jobs=-1
            )
            nn.fit(embeddings_matrix)
            distances, _ = nn.kneighbors(embeddings_matrix)
            k_distances = distances[:, -1]
            # eps candidate: the requested percentile of the k-distances
            min_eps = np.percentile(k_distances, quantile * 100)
            k_distances = np.sort(k_distances)
            fig = go.Figure()
            fig.add_trace(go.Scatter(y=k_distances, mode="lines", name="k-distances"))
            fig.add_hline(
                y=min_eps,
                line=dict(color="red", dash="dash"),
                name=f"min_eps = {min_eps:.2f}",
            )
            fig.update_layout(
                title="k-Distance Graph",
                xaxis_title="Index",
                yaxis_title="Distance",
                width=800,
                height=600,
                template="plotly_dark",
            )
            return fig, min_eps
    return None, None

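
# Hedged usage sketch (caller assumed, not from the source): the returned
# min_eps is a k-distance percentile estimate that could seed the distance
# threshold sweep in perform_clustering below.
# fig, min_eps = plot_k_distance(np.array(df["embeddings"].tolist()))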

def find_most_similar_comment(cluster_data, avg_embedding):
    # Cosine similarity between the cluster average and each comment embedding
    similarities = [
        1 - cosine(avg_embedding, emb) for emb in cluster_data["embeddings"]
    ]
    most_similar_index = np.argmax(similarities)

    return cluster_data.iloc[most_similar_index]["comment"]


def format_text(text, line_length=50):
    """
    Format text by inserting line breaks every 'line_length' characters.

    :param text: The text to format.
    :param line_length: Maximum line length (default 50 characters).
    :return: The formatted text with <br> line breaks.
    """
    # Split the text into chunks of 'line_length' characters
    formatted_text = "<br>".join(
        text[i : i + line_length] for i in range(0, len(text), line_length)
    )
    return formatted_text


def replace_html_entities(text):
    """
    Replace known HTML entities in the text with their corresponding characters.

    :param text: The text containing HTML entities.
    :return: The text with the entities replaced.
    """
    replacements = {
        "&quot;": '"',
        "&amp;": "&",
        "&lt;": "<",
        "&gt;": ">",
        "<br>": "\n",  # Replace <br> with a newline
    }

    for entity, char in replacements.items():
        text = text.replace(entity, char)

    return text


def plot_sentiment_global(
    data,
    sentimiento_col="sentimiento",
    title="Comment Evolution by Sentiment",
    width=1200,
    height=600,
):
    """Bar chart with the global number of comments per sentiment."""
    df_global = data[sentimiento_col].value_counts().reset_index()
    df_global.columns = [sentimiento_col, "count"]

    fig_global = go.Figure()

    # Keys match the labels produced by map_sentiment
    color_palette = {"positivo": "#138d75", "negativo": "#a93226", "neutro": "#909497"}

    for sentimiento in df_global[sentimiento_col].unique():
        df_sentimiento = df_global[df_global[sentimiento_col] == sentimiento]
        fig_global.add_trace(
            go.Bar(
                x=df_sentimiento[sentimiento_col],
                y=df_sentimiento["count"],
                text=df_sentimiento["count"],
                textposition="inside",
                insidetextanchor="middle",
                name=sentimiento,
                marker=dict(color=color_palette[sentimiento]),
            )
        )

    fig_global.update_layout(
        title=f"{title} - Global",
        xaxis_title="Sentiment",
        yaxis_title="Total Number of Comments",
        legend_title="Sentiment",
        template="plotly_dark",
        width=width,
        height=height,
    )

    return fig_global


def plot_sentiment_daily(
    data,
    fecha_col="published_at",
    sentimiento_col="sentimiento",
    title="Comment Evolution by Sentiment",
    width=1200,
    height=600,
):
    """Stacked bar chart with the daily number of comments per sentiment."""
    data[fecha_col] = pd.to_datetime(data[fecha_col])

    df_grouped = (
        data.groupby([pd.Grouper(key=fecha_col, freq="D"), sentimiento_col])
        .size()
        .reset_index(name="count")
    )

    df_grouped["total_daily"] = df_grouped.groupby(pd.Grouper(key=fecha_col, freq="D"))[
        "count"
    ].transform("sum")
    df_grouped["percentage"] = df_grouped["count"] / df_grouped["total_daily"] * 100

    fig_daily = go.Figure()

    color_palette = {"positivo": "#138d75", "negativo": "#a93226", "neutro": "#909497"}

    for sentimiento in data[sentimiento_col].unique():
        df_sentimiento = df_grouped[df_grouped[sentimiento_col] == sentimiento]
        fig_daily.add_trace(
            go.Bar(
                x=df_sentimiento[fecha_col],
                # Stack the per-sentiment counts so the bars add up to the daily total
                y=df_sentimiento["count"],
                name=sentimiento,
                text=df_sentimiento["count"],
                texttemplate="%{text}",
                textposition="inside",
                insidetextanchor="middle",
                customdata=df_sentimiento["percentage"],
                # Hover info with percentage and count; %{fullData.name} is the trace name
                hovertemplate="<b>Date</b>: %{x}<br><b>Sentiment</b>: %{fullData.name}<br><b>Percentage</b>: %{customdata:.1f}%<br><b>Comments</b>: %{text}<extra></extra>",
                marker=dict(color=color_palette[sentimiento]),
            )
        )

    fig_daily.update_layout(
        title=f"{title} - Daily",
        xaxis_title="Date",
        yaxis_title="Total Comments",
        legend_title="Sentiment",
        barmode="stack",
        template="plotly_dark",
        width=width,
        height=height,
    )

    return fig_daily


def create_3d_umap_plot(data):

    def calculate_sentiment_info(data):
        # Sentiment counts and percentages per cluster, used for the hover text
        cluster_sentiments = (
            data.groupby("Cluster")["sentimiento"].value_counts().unstack(fill_value=0)
        )
        total_by_cluster = cluster_sentiments.sum(axis=1)
        sentiment_percentages = (
            cluster_sentiments.div(total_by_cluster, axis=0) * 100
        ).round(2)

        sentiment_info = {}
        for cluster in total_by_cluster.index:
            info = [
                f"{sentiment}: {count} ({percent}%)"
                for sentiment, count, percent in zip(
                    cluster_sentiments.columns,
                    cluster_sentiments.loc[cluster],
                    sentiment_percentages.loc[cluster],
                )
            ]
            sentiment_info[cluster] = (
                f"Total {total_by_cluster[cluster]}<br>" + "<br>".join(info)
            )

        return sentiment_info

    fig = go.Figure()

    fig.add_trace(
        go.Scatter3d(
            x=data["UMAP1"],
            y=data["UMAP2"],
            z=data["UMAP3"],
            mode="markers",
            marker=dict(
                size=3,
                color=data["Cluster"],
                colorscale="Viridis",
                colorbar=dict(title="Cluster"),
            ),
            text=data["sentimiento"],
            name="Points",
        )
    )

    fig.update_layout(
        scene=dict(xaxis_title="UMAP 1", yaxis_title="UMAP 2", zaxis_title="UMAP 3"),
        template="plotly_dark",
        title="3D Visualization with UMAP and Clustering",
    )

    sentiment_info = calculate_sentiment_info(data)

    # One hover string per point, keyed by the point's cluster
    hovertemplate = (
        "Cluster: %{marker.color}<br>"
        + data["Cluster"].map(sentiment_info)
        + "<br>"
        + "<extra></extra>"
    )

    fig.update_traces(hovertemplate=hovertemplate)

    fig.show()


def perform_clustering(data, min_eps, max_eps=0.95, n=5, embeddings_col="embeddings"):

    embeddings_matrix = np.array(data[embeddings_col].tolist())
    threshold_values = np.linspace(min_eps, max_eps, n)

    cluster_assignments = {}
    cluster_counts = {}
    calinski_harabasz_scores = {}
    silhouette_scores = {}
    most_similar_comments = {}

    for distance_threshold in threshold_values:
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            linkage="complete",
            metric="cosine",
        )
        data[f"cluster_{distance_threshold}"] = clustering.fit_predict(
            embeddings_matrix
        )
        cluster_assignments[distance_threshold] = data[f"cluster_{distance_threshold}"]
        cluster_counts[distance_threshold] = data[
            f"cluster_{distance_threshold}"
        ].value_counts()
        labels = data[f"cluster_{distance_threshold}"]

        # Quality metrics are only defined when there is more than one cluster
        if len(np.unique(labels)) > 1:
            # Calinski-Harabasz computed on the pairwise euclidean distance matrix
            euclidean_distances = pairwise_distances(
                embeddings_matrix, metric="euclidean"
            )
            ch_score = calinski_harabasz_score(euclidean_distances, labels)
            sil_score = silhouette_score(embeddings_matrix, labels, metric="cosine")
        else:
            ch_score = -1  # Fallback when there is a single cluster
            sil_score = -1
        calinski_harabasz_scores[distance_threshold] = ch_score
        silhouette_scores[distance_threshold] = sil_score

        # Representative comment: the one closest to each cluster's average embedding
        most_similar_comments[distance_threshold] = {}
        for cluster_id in np.unique(labels):
            cluster_data = data[data[f"cluster_{distance_threshold}"] == cluster_id]
            avg_embedding = np.mean(cluster_data[embeddings_col].tolist(), axis=0)
            most_similar_comments[distance_threshold][cluster_id] = (
                find_most_similar_comment(cluster_data, avg_embedding)
            )

    return (
        cluster_assignments,
        cluster_counts,
        calinski_harabasz_scores,
        silhouette_scores,
        most_similar_comments,
        data,
    )


def build_sankey_data(
    cluster_assignments,
    cluster_counts,
    most_similar_comments,
    min_items_by_cluster=10,
):
    labels = []
    source = []
    target = []
    values = []
    comments = []

    threshold_values = sorted(cluster_assignments.keys())
    valid_clusters = {}

    # Keep only clusters with at least min_items_by_cluster members
    for threshold in threshold_values:
        valid_clusters[threshold] = [
            j
            for j in np.unique(cluster_assignments[threshold])
            if cluster_counts[threshold].get(j, 0) >= min_items_by_cluster
        ]

    for i, threshold in enumerate(threshold_values):
        for j in valid_clusters[threshold]:
            cluster_name = (
                f"{j} (d={threshold})\nTotal: {cluster_counts[threshold].get(j, 0)}"
            )
            if cluster_name not in labels:
                labels.append(cluster_name)
                comments.append(
                    format_text(
                        replace_html_entities(
                            most_similar_comments[threshold].get(j, "N/A")
                        )
                    )
                )

        # Link clusters at consecutive thresholds by the number of shared items
        if i > 0:
            prev_threshold = threshold_values[i - 1]
            for prev_cluster in valid_clusters[prev_threshold]:
                for curr_cluster in valid_clusters[threshold]:
                    count = np.sum(
                        (cluster_assignments[prev_threshold] == prev_cluster)
                        & (cluster_assignments[threshold] == curr_cluster)
                    )
                    if count > 0:
                        source_idx = labels.index(
                            f"{prev_cluster} (d={prev_threshold})\nTotal: {cluster_counts[prev_threshold].get(prev_cluster, 0)}"
                        )
                        target_idx = labels.index(
                            f"{curr_cluster} (d={threshold})\nTotal: {cluster_counts[threshold].get(curr_cluster, 0)}"
                        )
                        source.append(source_idx)
                        target.append(target_idx)
                        values.append(count)

    return (labels, source, target, values, comments)


def plot_sankey(labels, source, target, values, comments, width=None, height=None):
    fig = go.Figure(
        go.Sankey(
            node=dict(
                pad=15,
                thickness=20,
                line=dict(color="black", width=0),
                label=labels,
                hovertemplate="<b>%{label}</b><br>"
                + "<br><b>Comment:</b><br>%{customdata}<extra></extra>",
                customdata=comments,
            ),
            link=dict(
                source=source,
                target=target,
                value=values,
                hovertemplate="<extra></extra>",
            ),
        )
    )
    fig.update_layout(
        title_text="Sankey Diagram of Agglomerative Clustering Transitions",
        font_size=14,
        width=width,
        height=height,
        template="plotly_dark",
    )

    return fig


def plot_clustering_metric(silhouette_scores, calinski_scores):
    """
    Plot silhouette and Calinski-Harabasz scores against the distance thresholds,
    on two separate Y axes, and mark the threshold with the best silhouette score.

    Args:
        silhouette_scores (dict): Maps distance thresholds to silhouette scores.
        calinski_scores (dict): Maps distance thresholds to Calinski-Harabasz scores.

    Returns:
        fig (plotly.graph_objects.Figure): The generated Plotly figure.
        best_threshold (float): Threshold with the highest silhouette score.
    """
    # Collect thresholds and scores
    silhouette_thresholds = sorted(silhouette_scores.keys())
    silhouette_metric_scores = [silhouette_scores[t] for t in silhouette_thresholds]

    calinski_thresholds = sorted(calinski_scores.keys())
    calinski_metric_scores = [calinski_scores[t] for t in calinski_thresholds]

    # Best threshold: the one with the highest silhouette score
    best_threshold = max(silhouette_scores, key=silhouette_scores.get)

    # Figure with two Y axes
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Silhouette score trace
    fig.add_trace(
        go.Scatter(
            x=silhouette_thresholds,
            y=silhouette_metric_scores,
            mode="lines+markers",
            name="Silhouette Score",
            marker=dict(color="red", size=10),
            line=dict(color="red", width=2),
            text=[
                f"Threshold: {t}<br>Silhouette Score: {s}"
                for t, s in zip(silhouette_thresholds, silhouette_metric_scores)
            ],
            hoverinfo="text",
        ),
        secondary_y=False,  # left Y axis
    )

    # Calinski-Harabasz score trace
    fig.add_trace(
        go.Scatter(
            x=calinski_thresholds,
            y=calinski_metric_scores,
            mode="lines+markers",
            name="Calinski-Harabasz Score",
            marker=dict(color="blue", size=10),
            line=dict(color="blue", width=2),
            text=[
                f"Threshold: {t}<br>Calinski-Harabasz Score: {s}"
                for t, s in zip(calinski_thresholds, calinski_metric_scores)
            ],
            hoverinfo="text",
        ),
        secondary_y=True,  # right Y axis
    )

    # Vertical line at the best threshold
    fig.add_vline(
        x=best_threshold,
        line=dict(color="green", width=2, dash="dash"),
        annotation_text=f"Best Threshold: {best_threshold}",
        annotation_position="top right",
    )

    # Layout
    fig.update_layout(
        title="Clustering Metrics vs. Threshold Distance",
        xaxis_title="Threshold Distance",
        yaxis_title="Silhouette Score",
        yaxis2_title="Calinski-Harabasz Score",
        font=dict(size=12),
        width=800,
        height=600,
        template="plotly_dark",
    )

    return fig, best_threshold


classifier = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    truncation=True,
)


def map_sentiment(estrella):
    # The nlptown model emits star ratings as labels ("1 star" ... "5 stars")
    if estrella in ["1 star", "2 stars"]:
        return "negativo"
    elif estrella == "3 stars":
        return "neutro"
    elif estrella in ["4 stars", "5 stars"]:
        return "positivo"


def classify_sentiment(texto):
    resultado = classifier(texto)[0]
    sentimiento = map_sentiment(resultado["label"])
    return (
        sentimiento,
        resultado["score"],
    )

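
# Illustrative call (assumed input and output, not from the source):
# classify_sentiment("Me encantó el video") -> ("positivo", <confidence score>)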

def classify_sentiment_df(data, comment_col="comment"):
    # Apply the module-level classifier to every comment in the DataFrame
    data["sentimiento"], data["confianza"] = zip(
        *data[comment_col].apply(classify_sentiment)
    )

    return data


def transform_embeddings(
    data, embeddings_col="embeddings", n_components=3, random_seed=42
):
    # Convert the embeddings column to a numpy matrix
    embeddings_matrix = np.array(data[embeddings_col].tolist())

    # Reduce dimensionality with UMAP
    umap_model = umap.UMAP(
        n_components=n_components, random_state=random_seed, metric="cosine"
    )
    data_umap = umap_model.fit_transform(embeddings_matrix)

    # Use distance percentiles to derive min_eps and max_eps
    distances = pairwise_distances(data_umap, metric="cosine")
    min_eps = np.percentile(distances, 10)
    max_eps = np.percentile(distances, 50)

    umap_data = pd.DataFrame(
        {"embeddings": [embedding.tolist() for embedding in data_umap]}
    )
    umap_data["comment"] = data["comment"]

    return umap_data, min_eps, max_eps


def determine_min_items_by_cluster(total):
    """Pick a minimum cluster size that scales with the number of comments."""
    if total < 50:
        min_items_by_cluster = 1
    elif total < 100:
        min_items_by_cluster = 5
    elif total < 500:
        min_items_by_cluster = 10
    else:
        min_items_by_cluster = int(round(total * 0.01, 2))

    return min_items_by_cluster

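
# --- Hedged example (not part of the original commit) ---
# main() below is left empty in the source, so this sketch shows one plausible
# way the helpers above could be wired together end to end. The URL is a
# placeholder and the exact flow is an assumption, not the author's pipeline.
def run_pipeline_example(url="https://www.youtube.com/watch?v=dQw4w9WgXcQ"):
    details = get_youtube_video_details(url, api_key)  # video metadata
    comments = get_youtube_comments(api_key, url)  # all top-level comments
    comments = classify_sentiment_df(comments)  # adds "sentimiento"/"confianza"
    comments = add_normalized_embeddings_to_dataframe(comments, "comment")

    # Reduce embeddings with UMAP and derive eps bounds for the threshold sweep
    umap_data, min_eps, max_eps = transform_embeddings(comments)
    (
        cluster_assignments,
        cluster_counts,
        ch_scores,
        sil_scores,
        most_similar_comments,
        umap_data,
    ) = perform_clustering(umap_data, min_eps, max_eps)

    # Sankey of how clusters merge as the distance threshold grows
    min_items = determine_min_items_by_cluster(len(umap_data))
    labels, source, target, values, node_comments = build_sankey_data(
        cluster_assignments, cluster_counts, most_similar_comments, min_items
    )
    fig = plot_sankey(labels, source, target, values, node_comments)
    return details, comments, fig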


def main(): ...


if __name__ == "__main__":
    main()