import logging
import os

import pandas as pd
import plotly.io as pio
from dotenv import load_dotenv
from flask import Flask, render_template, request

import clustering

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)


def log_message(message):
    """Log an informational message through the root logger."""
    logging.info(message)


# Load a local .env only when the environment has not already been configured
# (e.g., a hosted environment such as a Hugging Face Space, where
# HUGGINGFACE_HUB_CACHE is set and variables are injected externally).
if os.getenv("HUGGINGFACE_HUB_CACHE") is None:
    load_dotenv()

api_key = os.getenv("youtube_api_key")

app = Flask(__name__)
app.logger.setLevel(logging.ERROR)
app.config["PROPAGATE_EXCEPTIONS"] = False

RANDOM_STATE = 333


def convert_graph_to_html(graph, full_html=False):
    """Render a Plotly figure as an HTML fragment, or None if there is no figure."""
    return pio.to_html(graph, full_html=full_html) if graph else None


@app.route("/", methods=["GET", "POST"])
def index():
    video_details = None
    k_distance_graph = None
    scores_graph = None
    sankey_graph = None
    image_path = None
    sentiment_daily_graph = None
    sentiment_count = None

    log_message("Starting processing...")

    if request.method == "POST":
        url = request.form["url"]
        if url:
            log_message("Fetching data from YouTube")
            video_details = clustering.get_youtube_video_details(url, api_key)
            comments_df = clustering.get_youtube_comments(api_key, url)

            log_message("Generating embeddings")
            comments_df = clustering.add_normalized_embeddings_to_dataframe(
                comments_df, "comment"
            )

            log_message("Processing the data")
            comments_df["published_at"] = pd.to_datetime(
                comments_df["published_at"]
            ).dt.date

            log_message("Classifying sentiment")
            comments_df = clustering.classify_sentiment_df(comments_df)
            # Persist the processed comments and reload them from disk.
            comments_df.to_pickle("./data/Comentarios-Youtube/comments_df.pkl")
            comments_df = pd.read_pickle("./data/Comentarios-Youtube/comments_df.pkl")

            sentiment_count = comments_df["sentimiento"].value_counts().to_dict()

            sentiment_daily_graph = clustering.plot_sentiment_daily(comments_df)
            sentiment_daily_graph = convert_graph_to_html(sentiment_daily_graph)

            umap_df, min_eps, max_eps = clustering.transform_embeddings(
                comments_df, embeddings_col="embeddings"
            )

            log_message("Generating word cloud")
            image_path = os.path.join("static", "wordcloud.png")
            clustering.plot_wordcloud(
                comments_df, text_column="comment", output_filename=image_path
            )

            total = comments_df.shape[0]
            min_items_by_cluster = clustering.determine_min_items_by_cluster(total)

            log_message("Modeling and computing metrics")
            (
                cluster_assignments,
                cluster_counts,
                calinski_harabasz_scores,
                silhouette_scores,
                most_similar_comments,
                umap_df,
            ) = clustering.perform_clustering(
                umap_df, min_eps, max_eps, n=10, embeddings_col="embeddings"
            )

            log_message("Building the Sankey diagram")
            labels, source, target, values, comments = clustering.build_sankey_data(
                cluster_assignments,
                cluster_counts,
                most_similar_comments,
                min_items_by_cluster=min_items_by_cluster,
            )
            sankey_graph = clustering.plot_sankey(
                labels, source, target, values, comments, height=1000, width=1200
            )
            sankey_graph = convert_graph_to_html(sankey_graph)

            scores_graph, _ = clustering.plot_clustering_metric(
                silhouette_scores, calinski_harabasz_scores
            )
            scores_graph = convert_graph_to_html(scores_graph)

    return render_template(
        "index.html",
        video_details=video_details,
        k_distance_graph=k_distance_graph,
        sankey_graph=sankey_graph,
        scores_graph=scores_graph,
        wordcloud_path=image_path,
        sentiment_daily_graph=sentiment_daily_graph,
        sentiment_count=sentiment_count,
    )


# gunicorn -b 0.0.0.0:5000 app_clustering.app:app
# http://172.20.0.2:5000/
# http://0.0.0.0:5000/

if __name__ == "__main__":
    app.run(host='0.0.0.0', port=7860)
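
# Local setup sketch (an assumption for local runs, not part of the original
# app): python-dotenv's load_dotenv() reads a `.env` file from the working
# directory, and the only key this module reads is `youtube_api_key`, so a
# minimal `.env` would look like (placeholder value):
#
#   youtube_api_key=<YOUR_YOUTUBE_DATA_API_KEY>
#
# With that in place, the dev server above serves on port 7860, while the
# gunicorn command in the comments serves the same app on port 5000.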