# ProyectoDS-AnalizaTube / clustering.py
import os
import re
import unicodedata
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import umap
from dotenv import load_dotenv
from googleapiclient.discovery import build
from plotly.subplots import make_subplots
from scipy.spatial.distance import cosine
from sentence_transformers import SentenceTransformer
from sklearn import set_config
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import (
calinski_harabasz_score,
pairwise_distances,
silhouette_score,
)
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from transformers import pipeline
from wordcloud import WordCloud
from concurrent.futures import ThreadPoolExecutor
import logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
def log_message(message):
    """Log an informational message using the module-level logging configuration."""
    logging.info(message)
try:
    import torch
    # Use GPU settings only when CUDA is actually available
    cuda_available = torch.cuda.is_available()
    device = 0 if cuda_available else -1
    embeddings_device = "cuda" if cuda_available else "cpu"
    batch_size = 128 if cuda_available else 32
except ImportError:
    device = -1  # torch is not installed, so force CPU usage
    embeddings_device = "cpu"
    batch_size = 32
load_dotenv()
api_key = os.getenv("youtube_api_key")
RANDOM_STATE = 333
stopwords_es = [
"a",
"al",
"algo",
"algún",
"alguna",
"algunas",
"alguno",
"algunos",
"ante",
"antes",
"bajo",
"bastante",
"bien",
"cada",
"casi",
"como",
"con",
"cuanto",
"de",
"del",
"desde",
"donde",
"durante",
"el",
"ella",
"ellos",
"en",
"encima",
"ese",
"eso",
"esta",
"estas",
"este",
"estos",
"fuera",
"hay",
"la",
"las",
"le",
"lo",
"los",
"más",
"me",
"mi",
"mí",
"menos",
"mismo",
"mucho",
"muy",
"nada",
"ni",
"no",
"nos",
"nuestro",
"nuestra",
"o",
"os",
"para",
"pero",
"poco",
"por",
"que",
"quien",
"si",
"sólo",
"sobre",
"su",
"sus",
"te",
"tu",
"tus",
"un",
"una",
"unas",
"uno",
"unos",
"vos",
"ya",
"yo",
"además",
"alrededor",
"aún",
"bajo",
"bien",
"cada",
"cierta",
"ciertas",
"como",
"con",
"de",
"debe",
"dentro",
"dos",
"ella",
"en",
"entonces",
"entre",
"esa",
"esos",
"está",
"hasta",
"incluso",
"lejos",
"lo",
"luego",
"medio",
"mientras",
"muy",
"nunca",
"o",
"otro",
"para",
"pero",
"poco",
"por",
"se",
"si",
"sin",
"sobre",
"tan",
"te",
"ten",
"tendría",
"todos",
"total",
"un",
"una",
"uno",
"ustedes",
"yo",
"y",
"es",
"son",
"solo",
"les",
]
def normalize_text(text):
    """Strip accents/diacritics and lowercase the text."""
    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII")
    text = text.lower()
    return text
def remove_stopwords(text, stopwords):
    """Split the text into words and drop any word found in `stopwords`."""
    return [word for word in text.split() if word not in stopwords]
def plot_wordcloud(data, text_column, output_filename=None):
    """Build a word cloud from a text column, optionally saving it as a PNG file."""
    text = " ".join(data[text_column])
    stopwords_set = set(stopwords_es)
    normalized_text = normalize_text(text)
    cleaned_text = remove_stopwords(normalized_text, stopwords_set)
    filtered_text = replace_html_entities(" ".join(cleaned_text))
    # Build the word cloud from the cleaned text
    wordcloud = WordCloud(
        width=800, height=400, background_color="white", normalize_plurals=True
    ).generate(filtered_text)
    # Render the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    if output_filename:
        plt.savefig(output_filename, format="png")
    plt.close()
    return output_filename
def extract_video_id(url):
"""
Extrae el video_id de una URL de YouTube.
Parámetros:
- url: str, la URL del video de YouTube.
Retorna:
- video_id: str, el identificador del video de YouTube.
"""
# Expresión regular para encontrar el video_id en una URL de YouTube
pattern = r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})"
match = re.search(pattern, url)
if match:
return match.group(1)
else:
raise ValueError("No se pudo encontrar un ID de video en la URL proporcionada.")
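# Usage sketch for extract_video_id. The helper name and the URLs below are hypothetical
# examples of the formats the regex is meant to cover (watch, youtu.be and embed links).
def _example_extract_video_id():
    for url in (
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
        "https://youtu.be/dQw4w9WgXcQ",
        "https://www.youtube.com/embed/dQw4w9WgXcQ",
    ):
        log_message(f"{url} -> {extract_video_id(url)}")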
def get_youtube_video_details(url, api_key):
"""
Obtiene detalles de un video de YouTube usando la API de YouTube Data v3.
:param video_id: ID del video de YouTube.
:param api_key: Clave de API de YouTube Data v3.
:return: Un diccionario con el nombre del video, el canal, el número de vistas y el número de comentarios.
"""
try:
youtube = build("youtube", "v3", developerKey=api_key)
video_id = extract_video_id(url)
request = youtube.videos().list(part="snippet,statistics", id=video_id)
response = request.execute()
if "items" in response and len(response["items"]) > 0:
video = response["items"][0]
details = {
"title": video["snippet"]["title"],
"channel_title": video["snippet"]["channelTitle"],
"view_count": video["statistics"].get("viewCount", "No disponible"),
"comment_count": video["statistics"].get(
"commentCount", "No disponible"
),
}
return details
else:
return {"error": "No se encontró el video con el ID proporcionado."}
except Exception as e:
return {"error": str(e)}
def get_youtube_comments(api_key, url, max_results=100):
"""
Obtiene comentarios de un video de YouTube y los convierte en un DataFrame de pandas.
Parámetros:
- api_key: str, la clave de API de YouTube.
- video_id: str, el ID del video de YouTube.
- max_results: int, el número máximo de comentarios a obtener por solicitud (predeterminado es 100).
Retorna:
- df: pandas DataFrame, contiene los comentarios del video.
"""
# Crear el servicio de la API de YouTube
youtube = build("youtube", "v3", developerKey=api_key)
# Solicitar los comentarios del video
video_id = extract_video_id(url)
request = youtube.commentThreads().list(
part="snippet", videoId=video_id, maxResults=max_results
)
response = request.execute()
    # List to accumulate the comment records
    comments_data = []
    # Parse the first page of comments into the list
for item in response["items"]:
comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
author = item["snippet"]["topLevelComment"]["snippet"]["authorDisplayName"]
published_at = item["snippet"]["topLevelComment"]["snippet"]["publishedAt"]
comments_data.append(
{"author": author, "comment": comment, "published_at": published_at}
)
    # Fetch additional pages of comments while more are available
next_page_token = response.get("nextPageToken")
while next_page_token:
request = youtube.commentThreads().list(
part="snippet",
videoId=video_id,
pageToken=next_page_token,
maxResults=max_results,
)
response = request.execute()
for item in response["items"]:
comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
author = item["snippet"]["topLevelComment"]["snippet"]["authorDisplayName"]
published_at = item["snippet"]["topLevelComment"]["snippet"]["publishedAt"]
comments_data.append(
{"author": author, "comment": comment, "published_at": published_at}
)
next_page_token = response.get("nextPageToken")
    # Convert the list of comments into a pandas DataFrame
df = pd.DataFrame(comments_data)
return df
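# Usage sketch for the comment download step. It assumes a valid "youtube_api_key" is set in
# the environment; the helper name and the video URL below are hypothetical, for illustration only.
def _example_download_comments():
    url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # hypothetical video URL
    comments_df = get_youtube_comments(api_key, url, max_results=100)
    log_message(f"Downloaded {len(comments_df)} comments")
    return comments_df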
def add_normalized_embeddings_to_dataframe(
data, text_column, model_name="paraphrase-multilingual-MiniLM-L12-v2"
):
"""
Genera y normaliza embeddings para una columna de texto en un DataFrame y agrega estos embeddings como nuevas columnas.
Parámetros:
- data: pandas DataFrame, el DataFrame que contiene la columna de texto.
- text_column: str, el nombre de la columna en el DataFrame que contiene el texto para generar embeddings.
- model_name: str, el nombre del modelo de SentenceTransformer a utilizar.
- batch_size: int, el tamaño del lote para procesamiento eficiente.
Retorna:
- data: pandas DataFrame, el DataFrame original con las nuevas columnas de embeddings normalizados.
"""
model = SentenceTransformer(model_name, device=embeddings_device)
sentences = data[text_column].tolist()
embeddings = model.encode(sentences, batch_size=batch_size, convert_to_numpy=True, device=device)
normalized_embeddings = normalize(embeddings, norm="l2")
data["embeddings"] = list(normalized_embeddings)
return data
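# Minimal sketch of the embedding step on a toy DataFrame; the helper name and the invented
# comments are illustrative only, and running it downloads the SentenceTransformer model.
def _example_embeddings():
    toy = pd.DataFrame({"comment": ["gran video", "no me gusto", "excelente explicacion"]})
    toy = add_normalized_embeddings_to_dataframe(toy, "comment")
    log_message(f"Embedding dimension: {len(toy['embeddings'].iloc[0])}")
    return toy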
def plot_k_distance(data, threshold=0.01, quantile=0.95):
    """Plot the sorted k-distance curve for an embeddings matrix and suggest a minimum eps value."""
    # embeddings_matrix = np.array(data["embeddings"].tolist())
    embeddings_matrix = data.copy()
    # Try increasingly large fractions of the data as min_samples until k-NN is feasible
    for threshold in [threshold, 0.05, 0.1, 0.2]:
        min_samples = int(round(data.shape[0] * threshold, 0))
        n_neighbors = min_samples - 1
        if n_neighbors > 2:
nn = NearestNeighbors(
n_neighbors=n_neighbors, algorithm="auto", metric="cosine", n_jobs=-1
)
nn.fit(embeddings_matrix)
distances, _ = nn.kneighbors(embeddings_matrix)
k_distances = distances[:, -1]
min_eps = np.percentile(k_distances, quantile * 100)
k_distances = np.sort(k_distances)
fig = go.Figure()
fig.add_trace(go.Scatter(y=k_distances, mode="lines", name="k-distances"))
fig.add_hline(
y=min_eps,
line=dict(color="red", dash="dash"),
name=f"min_eps = {min_eps:.2f}",
)
fig.update_layout(
title="k-Distance Graph",
xaxis_title="Index",
yaxis_title="Distance",
width=800,
height=600,
template="plotly_dark",
)
return fig, min_eps
return None, None
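# Usage sketch of the k-distance heuristic. The helper name is hypothetical, and `embeddings`
# is assumed to be the 2-D numpy array of (reduced) embeddings, e.g. np.array(df["embeddings"].tolist()).
def _example_k_distance(embeddings):
    fig, min_eps = plot_k_distance(embeddings)
    if fig is not None:
        log_message(f"Suggested min_eps from the k-distance curve: {min_eps:.4f}")
    return min_eps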
def find_most_similar_comment(cluster_data, avg_embedding):
    """Return the comment whose embedding is most cosine-similar to the cluster's average embedding."""
    similarities = [
1 - cosine(avg_embedding, emb) for emb in cluster_data["embeddings"]
]
most_similar_index = np.argmax(similarities)
return cluster_data.iloc[most_similar_index]["comment"]
def format_text(text, line_length=50):
"""
Formatea el texto agregando saltos de línea cada 'line_length' caracteres.
:param text: El texto a formatear.
:param line_length: La longitud máxima de cada línea (por defecto 50 caracteres).
:return: El texto formateado con saltos de línea.
"""
# Divide el texto en partes de longitud 'line_length'
formatted_text = "<br>".join(
text[i : i + line_length] for i in range(0, len(text), line_length)
)
return formatted_text
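# Quick illustration of format_text (helper name is hypothetical): a 120-character string
# wrapped every 50 characters yields three segments joined by two <br> tags.
def _example_format_text():
    sample = "x" * 120
    assert format_text(sample, line_length=50).count("<br>") == 2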
def replace_html_entities(text):
"""
Reemplaza entidades HTML conocidas en el texto con sus caracteres correspondientes.
:param text: El texto con entidades HTML.
:return: El texto con las entidades reemplazadas.
"""
replacements = {
"&quot;": '"',
"&amp;": "&",
"&lt;": "<",
"&gt;": ">",
"<br>": "\n", # Reemplazar <br> con salto de línea
}
for entity, char in replacements.items():
text = text.replace(entity, char)
return text
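# Quick illustration of replace_html_entities on a typical API comment string
# (helper name and sample text are invented for illustration).
def _example_replace_html_entities():
    raw = "Me encanto &quot;el final&quot;<br>&amp; la musica"
    assert replace_html_entities(raw) == 'Me encanto "el final"\n& la musica'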
def plot_sentiment_global(
data,
sentimiento_col="sentimiento",
title="Evolución de Comentarios por Sentimiento",
width=1200,
height=600,
):
""""""
df_global = data[sentimiento_col].value_counts().reset_index()
df_global.columns = [sentimiento_col, "count"]
fig_global = go.Figure()
color_palette = {"positivo": "#138d75", "negativo": "#a93226", "neutro": "#909497"}
for sentimiento in df_global[sentimiento_col].unique():
df_sentimiento = df_global[df_global[sentimiento_col] == sentimiento]
fig_global.add_trace(
go.Bar(
x=df_sentimiento[sentimiento_col],
y=df_sentimiento["count"],
text=df_sentimiento["count"],
textposition="inside",
insidetextanchor="middle",
name=sentimiento,
marker=dict(color=color_palette[sentimiento]),
)
)
fig_global.update_layout(
title=f"{title} - Global",
xaxis_title="Sentimiento",
yaxis_title="Número Total de Comentarios",
legend_title="Sentimiento",
template="plotly_dark",
width=width,
height=height,
)
return fig_global
def plot_sentiment_daily(
data,
fecha_col="published_at",
sentimiento_col="sentimiento",
title="Evolución de Comentarios por Sentimiento",
width=1200,
height=600,
):
""""""
data[fecha_col] = pd.to_datetime(data[fecha_col])
df_grouped = (
data.groupby([pd.Grouper(key=fecha_col, freq="D"), sentimiento_col])
.size()
.reset_index(name="count")
)
df_grouped["total_daily"] = df_grouped.groupby(pd.Grouper(key=fecha_col, freq="D"))[
"count"
].transform("sum")
df_grouped["percentage"] = df_grouped["count"] / df_grouped["total_daily"] * 100
fig_daily = go.Figure()
color_palette = {"positivo": "#138d75", "negativo": "#a93226", "neutro": "#909497"}
for sentimiento in data[sentimiento_col].unique():
df_sentimiento = df_grouped[df_grouped[sentimiento_col] == sentimiento]
fig_daily.add_trace(
go.Bar(
x=df_sentimiento[fecha_col],
y=df_sentimiento["total_daily"],
name=sentimiento,
text=df_sentimiento["count"],
texttemplate="%{text}",
textposition="inside",
insidetextanchor="middle",
customdata=df_sentimiento["percentage"],
hovertemplate="<b>Fecha</b>: %{x}<br><b>Sentimiento</b>: %{name}<br><b>Porcentaje</b>: %{customdata:.1f}%<br><b>Total de Comentarios</b>: %{text}<extra></extra>", # Información emergente con porcentaje y total
marker=dict(color=color_palette[sentimiento]),
)
)
fig_daily.update_layout(
title=f"{title} - Por Día",
xaxis_title="Fecha",
yaxis_title="Total de Comentarios",
legend_title="Sentimiento",
barmode="stack",
template="plotly_dark",
width=width,
height=height,
)
return fig_daily
def create_3d_umap_plot(data):
    """Show a 3D scatter of the UMAP projection colored by cluster, with per-cluster sentiment counts on hover."""
    def calculate_sentiment_info(data):
cluster_sentiments = (
data.groupby("Cluster")["sentimiento"].value_counts().unstack(fill_value=0)
)
total_by_cluster = cluster_sentiments.sum(axis=1)
sentiment_percentages = (
cluster_sentiments.div(total_by_cluster, axis=0) * 100
).round(2)
sentiment_info = {}
for cluster in total_by_cluster.index:
info = [
f"{sentiment}: {count} ({percent}%)"
for sentiment, count, percent in zip(
cluster_sentiments.columns,
cluster_sentiments.loc[cluster],
sentiment_percentages.loc[cluster],
)
]
sentiment_info[cluster] = (
f"Total {total_by_cluster[cluster]}<br>" + "<br>".join(info)
)
return sentiment_info
fig = go.Figure()
fig.add_trace(
go.Scatter3d(
x=data["UMAP1"],
y=data["UMAP2"],
z=data["UMAP3"],
mode="markers",
marker=dict(
size=3,
color=data["Cluster"],
colorscale="Viridis",
colorbar=dict(title="Cluster"),
),
text=data["sentimiento"],
name="Puntos",
)
)
fig.update_layout(
scene=dict(xaxis_title="UMAP 1", yaxis_title="UMAP 2", zaxis_title="UMAP 3"),
template="plotly_dark",
title="Visualización 3D con UMAP y Clustering",
)
sentiment_info = calculate_sentiment_info(data)
hovertemplate = (
"Cluster: %{marker.color}<br>"
+ data["Cluster"].map(sentiment_info)
+ "<br>"
+ "<extra></extra>"
)
fig.update_traces(hovertemplate=hovertemplate)
fig.show()
def perform_clustering(
data, min_eps, max_eps=0.95, n=5, threshold_values=None, embeddings_col="embeddings"
):
embeddings_matrix = np.array(data[embeddings_col].tolist())
if threshold_values is None:
threshold_values = np.round(np.linspace(min_eps, max_eps, n), 6).astype(float)
log_message(f"perform_clustering {threshold_values}")
cluster_assignments = {}
cluster_counts = {}
calinski_harabasz_scores = {}
silhouette_scores = {}
most_similar_comments = {}
for distance_threshold in threshold_values:
log_message(distance_threshold)
distance_threshold = round(distance_threshold, 6)
clustering = AgglomerativeClustering(
n_clusters=None,
distance_threshold=distance_threshold,
linkage="complete",
metric="cosine",
)
        # Format the column name with the threshold rounded to six decimal places
        cluster_name = f"cluster_{distance_threshold:.6f}"
data[cluster_name] = clustering.fit_predict(embeddings_matrix)
        # Store the assignments and per-cluster counts for this threshold
        cluster_assignments[distance_threshold] = data[cluster_name]
        cluster_counts[distance_threshold] = data[cluster_name].value_counts()
labels = data[cluster_name]
        # Calinski-Harabasz score
        if len(np.unique(labels)) > 1:
            # Compute a pairwise euclidean distance matrix over the embeddings
            euclidean_distances = pairwise_distances(embeddings_matrix, metric="euclidean")
            ch_score = calinski_harabasz_score(euclidean_distances, labels)
            ch_score = round(ch_score, 2)
        else:
            ch_score = -1  # Default value when there is only one cluster
        calinski_harabasz_scores[distance_threshold] = ch_score
        # Silhouette score
        if len(np.unique(labels)) > 1:
            sil_score = silhouette_score(embeddings_matrix, labels, metric="cosine")
            sil_score = round(sil_score, 2)
        else:
            sil_score = -1  # Default value when there is only one cluster
        silhouette_scores[distance_threshold] = sil_score
        # For each cluster, keep the comment closest to the cluster's average embedding
        most_similar_comments[distance_threshold] = {}
        for cluster_id in np.unique(labels):
            cluster_data = data[data[cluster_name] == cluster_id]
            avg_embedding = np.mean(cluster_data[embeddings_col].tolist(), axis=0)
            most_similar_comment = find_most_similar_comment(cluster_data, avg_embedding)
            most_similar_comments[distance_threshold][cluster_id] = most_similar_comment
return (
cluster_assignments,
cluster_counts,
calinski_harabasz_scores,
silhouette_scores,
most_similar_comments,
data,
)
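# Usage sketch of the clustering sweep. The helper name is hypothetical; `df` is assumed to
# already carry the "embeddings" and "comment" columns, and min_eps/max_eps are assumed to
# come from transform_embeddings or plot_k_distance.
def _example_clustering_sweep(df, min_eps, max_eps):
    (assignments, counts, ch_scores, sil_scores, representative_comments, df) = perform_clustering(
        df, min_eps, max_eps=max_eps, n=5
    )
    log_message(f"Thresholds evaluated: {sorted(assignments.keys())}")
    return assignments, counts, ch_scores, sil_scores, representative_comments, df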
def build_sankey_data(
cluster_assignments,
cluster_counts,
most_similar_comments,
min_items_by_cluster=10,
):
    """Build the node labels, link indices, link values and hover comments for the Sankey diagram of cluster transitions between thresholds."""
    labels = []
source = []
target = []
values = []
comments = []
sorted_threshold_values = sorted(cluster_assignments.keys())
log_message(f"build_sankey_data {sorted_threshold_values}")
valid_clusters = {}
for threshold in sorted_threshold_values:
        log_message(threshold)
valid_clusters[threshold] = [
j
for j in np.unique(cluster_assignments[threshold])
if cluster_counts[threshold].get(j, 0) >= min_items_by_cluster
]
for i, threshold in enumerate(sorted_threshold_values):
for j in valid_clusters[threshold]:
cluster_name = (
f"{j} (d={threshold})\nTotal: {cluster_counts[threshold].get(j, 0)}"
)
if cluster_name not in labels:
labels.append(cluster_name)
comments.append(
format_text(
replace_html_entities(
most_similar_comments[threshold].get(j, "N/A")
)
)
)
if i > 0:
prev_threshold = sorted_threshold_values[i - 1]
for prev_cluster in valid_clusters[prev_threshold]:
for curr_cluster in valid_clusters[threshold]:
count = np.sum(
(cluster_assignments[prev_threshold] == prev_cluster)
& (cluster_assignments[threshold] == curr_cluster)
)
if count > 0:
source_idx = labels.index(
f"{prev_cluster} (d={prev_threshold})\nTotal: {cluster_counts[prev_threshold].get(prev_cluster, 0)}"
)
target_idx = labels.index(
f"{curr_cluster} (d={threshold})\nTotal: {cluster_counts[threshold].get(curr_cluster, 0)}"
)
source.append(source_idx)
target.append(target_idx)
values.append(count)
return (labels, source, target, values, comments)
def plot_sankey(labels, source, target, values, comments, width=None, height=None):
fig = go.Figure(
go.Sankey(
node=dict(
pad=15,
thickness=20,
line=dict(color="black", width=0),
label=labels,
hovertemplate="<b>%{label}</b><br>"
+ "<br><b>Commentario:</b><br>%{customdata}<extra></extra>",
customdata=comments,
),
link=dict(
source=source,
target=target,
value=values,
hovertemplate="<extra></extra>",
),
)
)
fig.update_layout(
title_text="Sankey Diagram of Agglomerative Clustering Transitions",
font_size=14,
width=width,
height=height,
template="plotly_dark",
)
return fig
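# Sketch of how the Sankey pieces fit together. The helper name is hypothetical; it assumes
# the outputs of perform_clustering and derives the minimum cluster size from the comment total.
def _example_sankey(cluster_assignments, cluster_counts, most_similar_comments, total_comments):
    min_items = determine_min_items_by_cluster(total_comments)
    labels, source, target, values, comments = build_sankey_data(
        cluster_assignments, cluster_counts, most_similar_comments, min_items_by_cluster=min_items
    )
    return plot_sankey(labels, source, target, values, comments, width=1200, height=800)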
def plot_clustering_metric(silhouette_scores, calinski_scores):
"""
Genera un gráfico que muestra los puntajes de silhouette y Calinski-Harabasz frente a los umbrales de distancia,
con dos ejes Y diferentes y marca el umbral con el mejor puntaje de silhouette.
Args:
silhouette_scores (dict): Un diccionario donde las claves son umbrales de distancia
y los valores son puntajes de silhouette correspondientes.
calinski_scores (dict): Un diccionario donde las claves son umbrales de distancia
y los valores son puntajes de Calinski-Harabasz correspondientes.
Returns:
fig (plotly.graph_objects.Figure): Un objeto Figure de Plotly con el gráfico generado.
"""
    # Collect thresholds and scores for each metric
    silhouette_thresholds = sorted(silhouette_scores.keys())
    silhouette_metric_scores = [silhouette_scores[t] for t in silhouette_thresholds]
    calinski_thresholds = sorted(calinski_scores.keys())
    calinski_metric_scores = [calinski_scores[t] for t in calinski_thresholds]
    # Pick the best threshold as the one with the highest silhouette score
    best_threshold = max(silhouette_scores, key=silhouette_scores.get)
    # Create the figure with two Y axes
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    # Trace for the silhouette score
fig.add_trace(
go.Scatter(
x=silhouette_thresholds,
y=silhouette_metric_scores,
mode="lines+markers",
name="Silhouette Score",
marker=dict(color="red", size=10),
line=dict(color="red", width=2),
text=[
f"Threshold: {t}<br>Silhouette Score: {s}"
for t, s in zip(silhouette_thresholds, silhouette_metric_scores)
],
hoverinfo="text",
),
        secondary_y=False,  # Left Y axis
    )
    # Trace for the Calinski-Harabasz score
fig.add_trace(
go.Scatter(
x=calinski_thresholds,
y=calinski_metric_scores,
mode="lines+markers",
name="Calinski-Harabasz Score",
marker=dict(color="blue", size=10),
line=dict(color="blue", width=2),
text=[
f"Threshold: {t}<br>Calinski-Harabasz Score: {s}"
for t, s in zip(calinski_thresholds, calinski_metric_scores)
],
hoverinfo="text",
),
        secondary_y=True,  # Right Y axis
    )
    # Vertical line at the best silhouette threshold
fig.add_vline(
x=best_threshold,
line=dict(color="green", width=2, dash="dash"),
annotation_text=f"Best Threshold: {best_threshold}",
annotation_position="top right",
)
    # Configure the chart layout
fig.update_layout(
title="Clustering Metrics vs. Threshold Distance",
xaxis_title="Threshold Distance",
yaxis_title="Silhouette Score",
yaxis2_title="Calinski-Harabasz Score",
font=dict(size=12),
width=800,
height=600,
template="plotly_dark",
)
return fig, best_threshold
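# Sketch of plot_clustering_metric with toy score dictionaries; the helper name and the
# numbers are invented and only show the expected {threshold: score} shape of the inputs.
def _example_metric_plot():
    sil = {0.30: 0.12, 0.45: 0.21, 0.60: 0.18}
    ch = {0.30: 150.0, 0.45: 230.5, 0.60: 190.2}
    fig, best = plot_clustering_metric(sil, ch)
    log_message(f"Best threshold by silhouette: {best}")
    return fig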
# Multilingual star-rating sentiment classifier; runs on GPU when available, otherwise on CPU
classifier = pipeline(
"sentiment-analysis",
model="nlptown/bert-base-multilingual-uncased-sentiment",
truncation=True,
device=device
)
def map_sentiment(estrella):
    """Map the model's star-rating label to 'negativo', 'neutro' or 'positivo'."""
    if estrella in ["1 star", "2 stars"]:
return "negativo"
elif estrella == "3 stars":
return "neutro"
elif estrella in ["4 stars", "5 stars"]:
return "positivo"
def classify_sentiment_df(data, comment_col="comment", batch_size=32, num_threads=8):
    """Classify the sentiment of each comment and add "sentimiento" and "confianza" columns."""
    comentarios = data[comment_col].tolist()
    if device == 0:  # GPU available: classify in batches
        resultados = classifier(comentarios, batch_size=batch_size)
        data["sentimiento"] = [map_sentiment(r["label"]) for r in resultados]
        data["confianza"] = [r["score"] for r in resultados]
    else:  # No GPU: classify on CPU using a thread pool
        def classify_sentiment(texto):
resultado = classifier(texto)[0]
return map_sentiment(resultado["label"]), resultado["score"]
with ThreadPoolExecutor(max_workers=num_threads) as executor:
resultados = list(executor.map(classify_sentiment, comentarios))
sentimientos, confianzas = zip(*resultados)
data["sentimiento"] = sentimientos
data["confianza"] = confianzas
return data
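# Sentiment sketch on a toy DataFrame; the helper name and the comments are invented for
# illustration only, and running it downloads the nlptown sentiment model.
def _example_sentiment():
    toy = pd.DataFrame({"comment": ["Excelente video", "No me gusto nada", "Esta bien"]})
    toy = classify_sentiment_df(toy)
    log_message(toy[["comment", "sentimiento", "confianza"]].to_string(index=False))
    return toy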
def transform_embeddings(
    data, embeddings_col="embeddings", n_components=3, random_seed=42
):
    """Reduce the embeddings with UMAP and derive min/max eps bounds from distance percentiles."""
    # Convert the embeddings column to a numpy matrix
    embeddings_matrix = np.array(data[embeddings_col].tolist())
    # Apply UMAP for dimensionality reduction
    umap_model = umap.UMAP(
        n_components=n_components, random_state=random_seed, metric="cosine"
    )
    data_umap = umap_model.fit_transform(embeddings_matrix)
    # Use distance percentiles in the reduced space to suggest min_eps and max_eps
    distances = pairwise_distances(data_umap, metric="cosine")
    min_eps = np.percentile(distances, 10)
    max_eps = np.percentile(distances, 50)
umap_data = pd.DataFrame(
{"embeddings": [embedding.tolist() for embedding in data_umap]}
)
umap_data["comment"] = data["comment"]
return umap_data, min_eps, max_eps
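# Sketch of the UMAP reduction step. The helper name is hypothetical; `df` is assumed to
# carry the "embeddings" and "comment" columns produced earlier in the pipeline.
def _example_umap(df):
    umap_df, min_eps, max_eps = transform_embeddings(df, n_components=3)
    log_message(f"UMAP rows: {len(umap_df)}, min_eps={min_eps:.4f}, max_eps={max_eps:.4f}")
    return umap_df, min_eps, max_eps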
def determine_min_items_by_cluster(total):
    """Return the minimum cluster size to display, scaled to the total number of comments."""
if total < 50:
min_items_by_cluster = 1
elif total < 100:
min_items_by_cluster = 5
elif total < 500:
min_items_by_cluster = 10
else:
min_items_by_cluster = int(round(total * 0.01, 2))
return min_items_by_cluster
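# The size thresholds above in a nutshell (helper name is hypothetical): small comment sets
# keep every cluster, while larger sets require roughly 1% of the total per cluster.
def _example_min_items():
    for total in (30, 80, 300, 2000):
        log_message(f"{total} comments -> min items per cluster: {determine_min_items_by_cluster(total)}")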
def main(): ...
if __name__ == "__main__":
main()