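"""Flask front end for the YouTube comment analysis pipeline.

Given a video URL, the app fetches comments, generates embeddings, classifies
sentiment, clusters the comments, and renders the resulting graphs
(daily sentiment, Sankey diagram, clustering metrics, wordcloud).
"""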
import logging
import os

import pandas as pd
import plotly.io as pio
from dotenv import load_dotenv
from flask import Flask, render_template, request

import clustering

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

def log_message(message):
    """Log an informational message via the configured root logger."""
    logging.info(message)

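# Load a local .env file only when HUGGINGFACE_HUB_CACHE is unset
# (presumably when running outside the hosted Hugging Face environment).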
if os.getenv("HUGGINGFACE_HUB_CACHE") is None:
    load_dotenv()

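# YouTube Data API key read from the environment.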
api_key = os.getenv("youtube_api_key")

app = Flask(__name__)
app.logger.setLevel(logging.ERROR)
app.config["PROPAGATE_EXCEPTIONS"] = False

RANDOM_STATE = 333


def convert_graph_to_html(graph, full_html=False):
    """Render a Plotly figure as an HTML fragment, or return None when there is no figure."""
    return pio.to_html(graph, full_html=full_html) if graph else None


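# Single route: accepts a YouTube URL via POST, runs the analysis pipeline,
# and renders all graphs on the index page.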
@app.route("/", methods=["GET", "POST"])
def index():
    video_details = None
    k_distance_graph = None
    scores_graph = None
    sankey_graph = None
    image_path = None
    sentiment_daily_graph = None
    sentiment_count = None

    current_directory = os.getcwd()
    log_message("Starting processing...")

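    # On POST, run the full pipeline for the submitted video URL.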
    if request.method == "POST":
        url = request.form["url"]
        if url:
            log_message("Fetching data from YouTube")
            video_details = clustering.get_youtube_video_details(url, api_key)
            comments_df = clustering.get_youtube_comments(api_key, url)
            log_message("Generating embeddings")
            comments_df = clustering.add_normalized_embeddings_to_dataframe(
                comments_df, "comment"
            )
            log_message("Processing the data")
            comments_df["published_at"] = pd.to_datetime(
                comments_df["published_at"]
            ).dt.date
            log_message("Classifying sentiment")
            comments_df = clustering.classify_sentiment_df(comments_df)
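            # Persist the processed comments to disk and read them back before continuing.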
            comments_df.to_pickle(
                "./data/Comentarios-Youtube/comments_df.pkl"
            )
            comments_df = pd.read_pickle(
                "./data/Comentarios-Youtube/comments_df.pkl"
            )
            sentiment_count = comments_df["sentimiento"].value_counts().to_dict()
            sentiment_daily_graph = clustering.plot_sentiment_daily(comments_df)

            sentiment_daily_graph = convert_graph_to_html(sentiment_daily_graph)

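            # Project the embeddings into a low-dimensional UMAP space and get
            # the eps range explored by the clustering step.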
            umap_df, min_eps, max_eps = clustering.transform_embeddings(
                comments_df, embeddings_col="embeddings"
            )
            log_message("Generating wordcloud")
            image_path = os.path.join("static", "wordcloud.png")
            clustering.plot_wordcloud(comments_df, text_column="comment", output_filename=image_path)

            total = comments_df.shape[0]
            
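            # Minimum number of items required per cluster, derived from the total comment count.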
            min_items_by_cluster = clustering.determine_min_items_by_cluster(total)
            log_message("Modeling and computing metrics")
            (
                cluster_assignments,
                cluster_counts,
                calinski_harabasz_scores,
                silhouette_scores,
                most_similar_comments,
                umap_df,
            ) = clustering.perform_clustering(
                umap_df, min_eps, max_eps, n=10, 
                embeddings_col="embeddings"
            )
            log_message("Building Sankey diagram")
            labels, source, target, values, comments = clustering.build_sankey_data(
                cluster_assignments,
                cluster_counts,
                most_similar_comments,
                min_items_by_cluster=min_items_by_cluster,
            )

            sankey_graph = clustering.plot_sankey(
                labels, source, target, values, comments, height=1000, width=1200
            )
            sankey_graph = convert_graph_to_html(sankey_graph)

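            # Silhouette and Calinski-Harabasz scores for the clustering sweep, embedded as HTML.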
            scores_graph, _ = clustering.plot_clustering_metric(
                silhouette_scores, calinski_harabasz_scores
            )
            scores_graph = convert_graph_to_html(scores_graph)

    return render_template(
        "index.html",
        video_details=video_details,
        k_distance_graph=k_distance_graph,
        sankey_graph=sankey_graph,
        scores_graph=scores_graph,
        wordcloud_path=image_path,
        sentiment_daily_graph=sentiment_daily_graph,
        sentiment_count=sentiment_count,
    )


#  gunicorn -b 0.0.0.0:5000 app_clustering.app:app
# http://172.20.0.2:5000/
# http://0.0.0.0:5000/
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=7860)