Spaces:
Runtime error
Runtime error
Update clustering.py
Browse files- clustering.py +31 -20
clustering.py
CHANGED
@@ -28,8 +28,15 @@ from wordcloud import WordCloud
|
|
28 |
|
29 |
from concurrent.futures import ThreadPoolExecutor
|
30 |
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
api_key = os.getenv("youtube_api_key")
|
35 |
|
@@ -338,18 +345,18 @@ def add_normalized_embeddings_to_dataframe(
|
|
338 |
Parámetros:
|
339 |
- data: pandas DataFrame, el DataFrame que contiene la columna de texto.
|
340 |
- text_column: str, el nombre de la columna en el DataFrame que contiene el texto para generar embeddings.
|
341 |
-
- model_name: str, el nombre del modelo de SentenceTransformer a utilizar
|
|
|
342 |
|
343 |
Retorna:
|
344 |
- data: pandas DataFrame, el DataFrame original con las nuevas columnas de embeddings normalizados.
|
345 |
"""
|
346 |
|
347 |
-
model = SentenceTransformer(model_name)
|
348 |
sentences = data[text_column].tolist()
|
349 |
-
embeddings = model.encode(sentences)
|
350 |
normalized_embeddings = normalize(embeddings, norm="l2")
|
351 |
-
|
352 |
-
data["embeddings"] = [embedding for embedding in normalized_embeddings]
|
353 |
|
354 |
return data
|
355 |
|
@@ -854,6 +861,7 @@ classifier = pipeline(
|
|
854 |
"sentiment-analysis",
|
855 |
model="nlptown/bert-base-multilingual-uncased-sentiment",
|
856 |
truncation=True,
|
|
|
857 |
)
|
858 |
|
859 |
|
@@ -866,22 +874,25 @@ def map_sentiment(estrella):
|
|
866 |
return "positivo"
|
867 |
|
868 |
|
869 |
-
def classify_sentiment_df(data, comment_col="comment", num_threads=8):
|
870 |
-
|
871 |
-
resultado = classifier(texto)[0]
|
872 |
-
sentimiento = map_sentiment(resultado["label"])
|
873 |
-
return sentimiento, resultado["score"]
|
874 |
|
875 |
-
|
876 |
-
|
877 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
878 |
|
879 |
-
|
880 |
-
|
881 |
-
results = list(executor.map(process_row, data[comment_col]))
|
882 |
|
883 |
-
|
884 |
-
|
|
|
885 |
|
886 |
return data
|
887 |
|
|
|
28 |
|
29 |
from concurrent.futures import ThreadPoolExecutor
|
30 |
|
31 |
+
# Select the inference device once, at import time.
#
# `device` follows the transformers pipeline convention (0 = first CUDA GPU,
# -1 = CPU); `embeddings_device` is the device string handed to
# SentenceTransformer; `batch_size` is the default batch size for encoding.
#
# All three settings are derived from a single CUDA check so they can never
# disagree: previously `embeddings_device` was "cuda" whenever torch could be
# imported, even on hosts without a GPU.
try:
    import torch
except ImportError:
    # Without torch there is no way to reach a GPU: force CPU.
    _cuda_available = False
else:
    _cuda_available = torch.cuda.is_available()

if _cuda_available:
    device = 0
    embeddings_device = "cuda"
    batch_size = 128
else:
    device = -1
    embeddings_device = "cpu"
    batch_size = 32
|
40 |
|
41 |
api_key = os.getenv("youtube_api_key")
|
42 |
|
|
|
345 |
Parámetros:
|
346 |
- data: pandas DataFrame, el DataFrame que contiene la columna de texto.
|
347 |
- text_column: str, el nombre de la columna en el DataFrame que contiene el texto para generar embeddings.
|
348 |
+
- model_name: str, el nombre del modelo de SentenceTransformer a utilizar.
|
349 |
+
- batch_size: int, el tamaño del lote para procesamiento eficiente.
|
350 |
|
351 |
Retorna:
|
352 |
- data: pandas DataFrame, el DataFrame original con las nuevas columnas de embeddings normalizados.
|
353 |
"""
|
354 |
|
355 |
+
model = SentenceTransformer(model_name, device=embeddings_device)
|
356 |
sentences = data[text_column].tolist()
|
357 |
+
embeddings = model.encode(sentences, batch_size=batch_size, convert_to_numpy=True, device=device)
|
358 |
normalized_embeddings = normalize(embeddings, norm="l2")
|
359 |
+
data["embeddings"] = list(normalized_embeddings)
|
|
|
360 |
|
361 |
return data
|
362 |
|
|
|
861 |
"sentiment-analysis",
|
862 |
model="nlptown/bert-base-multilingual-uncased-sentiment",
|
863 |
truncation=True,
|
864 |
+
device=device
|
865 |
)
|
866 |
|
867 |
|
|
|
874 |
return "positivo"
|
875 |
|
876 |
|
877 |
+
def classify_sentiment_df(data, comment_col="comment", batch_size=32, num_threads=8):
    """Classify the sentiment of every comment in a DataFrame.

    Adds two columns to ``data`` in place and returns the same object:
    ``"sentimiento"`` (the label produced by ``map_sentiment``) and
    ``"confianza"`` (the classifier score).

    Parameters:
    - data: pandas DataFrame holding the comments.
    - comment_col: str, name of the column with the text to classify.
    - batch_size: int, batch size passed to the classifier on the GPU path.
    - num_threads: int, number of worker threads used on the CPU path.

    Returns:
    - data: the input DataFrame with the two new columns.
    """
    comentarios = data[comment_col].tolist()

    if not comentarios:
        # Nothing to classify: create the (empty) columns and skip the model
        # call, so an empty DataFrame does not raise.
        data["sentimiento"] = []
        data["confianza"] = []
        return data

    if device == 0:
        # GPU available: let the pipeline process the comments in batches.
        resultados = classifier(comentarios, batch_size=batch_size)
    else:
        # CPU only: classify one comment per call, spread over a thread pool.
        def classify_one(texto):
            return classifier(texto)[0]

        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # executor.map preserves input order, so results line up with rows.
            resultados = list(executor.map(classify_one, comentarios))

    data["sentimiento"] = [map_sentiment(r["label"]) for r in resultados]
    data["confianza"] = [r["score"] for r in resultados]

    return data
|
898 |
|