File size: 3,976 Bytes
63f9eaa
4179233
63f9eaa
 
ca7b7fb
63f9eaa
4179233
 
63f9eaa
 
 
 
 
 
4179233
 
 
 
63f9eaa
 
 
 
 
 
 
4179233
 
63f9eaa
4179233
 
 
 
63f9eaa
 
 
7341cb3
 
 
4179233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7341cb3
4179233
 
7341cb3
4179233
 
 
 
 
 
 
 
 
 
c1385ec
 
4179233
 
 
 
 
 
 
 
 
 
 
 
 
3d454b3
 
 
4179233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63f9eaa
4179233
 
 
63f9eaa
45aca14
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os

import pandas as pd
import plotly.io as pio
import clustering
from dotenv import load_dotenv
from flask import Flask, render_template, request
import logging

# Read a local .env file only when HUGGINGFACE_HUB_CACHE is absent from the
# environment (presumably the hosted deployment defines it -- TODO confirm).
if "HUGGINGFACE_HUB_CACHE" not in os.environ:
    load_dotenv()

# YouTube API key taken from the environment; None when the variable is unset.
api_key = os.environ.get("youtube_api_key")

# Flask application: only ERROR-and-above log records, and exceptions are not
# propagated out of the request handling.
app = Flask(__name__)
app.config.update(PROPAGATE_EXCEPTIONS=False)
app.logger.setLevel(logging.ERROR)

RANDOM_STATE = 333


def convert_graph_to_html(graph, full_html=False):
    """Render *graph* to HTML through ``pio.to_html``.

    Args:
        graph: Object handed to ``pio.to_html``. Any falsy value (for
            example ``None``) is treated as "no graph".
        full_html: Forwarded unchanged as the ``full_html`` keyword of
            ``pio.to_html``.

    Returns:
        Whatever ``pio.to_html`` returns for a truthy *graph*, otherwise
        ``None``.
    """
    if not graph:
        return None
    return pio.to_html(graph, full_html=full_html)


@app.route("/", methods=["GET", "POST"])
def index():
    """Render ``index.html``; on POST with a URL, run the comment analysis.

    A GET request, or a POST whose ``url`` form field is empty, renders the
    template with every result set to ``None``. A POST with a non-empty
    ``url`` calls the ``clustering`` helpers in sequence (video details,
    comments, embeddings, sentiment, word cloud, clustering, Sankey and
    metric plots) and hands their outputs to the template.

    Side effects on a POST with a URL:
        * ``./data/Comentarios-Youtube/comments_df.pkl`` is (re)written with
          the classified comments frame; the directory is created if missing.
        * ``clustering.plot_wordcloud`` is asked to write to
          ``static/wordcloud.png`` (path relative to the working directory).

    Returns:
        The response produced by ``render_template("index.html", ...)``.
    """
    video_details = None
    # Never populated in this view; the template still receives it as None.
    k_distance_graph = None
    scores_graph = None
    sankey_graph = None
    image_path = None
    sentiment_daily_graph = None
    sentiment_count = None

    # The output paths below are relative, so the working directory decides
    # where files end up; it is printed on every request.
    current_directory = os.getcwd()
    print("Directorio de trabajo actual:", current_directory)

    if request.method == "POST":
        url = request.form["url"]
        if url:
            video_details = clustering.get_youtube_video_details(url, api_key)
            comments_df = clustering.get_youtube_comments(api_key, url)
            comments_df = clustering.add_normalized_embeddings_to_dataframe(
                comments_df, "comment"
            )

            # Keep only the calendar date of each timestamp.
            comments_df["published_at"] = pd.to_datetime(
                comments_df["published_at"]
            ).dt.date

            comments_df = clustering.classify_sentiment_df(comments_df)

            # Persist a snapshot of the classified comments. The directory is
            # created first so a fresh checkout does not fail here. The frame
            # is NOT read back from disk: the in-memory object already holds
            # the same data, and re-reading a fixed shared path could pick up
            # a file written by a concurrent request.
            pickle_path = "./data/Comentarios-Youtube/comments_df.pkl"
            os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
            comments_df.to_pickle(pickle_path)

            sentiment_count = comments_df["sentimiento"].value_counts().to_dict()
            sentiment_daily_graph = clustering.plot_sentiment_daily(comments_df)
            sentiment_daily_graph = convert_graph_to_html(sentiment_daily_graph)

            umap_df, min_eps, max_eps = clustering.transform_embeddings(
                comments_df, embeddings_col="embeddings"
            )

            # NOTE(review): this path is relative to the working directory,
            # which may differ from the Flask static folder -- confirm the
            # image is served from where it is written.
            image_path = os.path.join("static", "wordcloud.png")
            clustering.plot_wordcloud(
                comments_df,
                text_column="comment",
                output_filename=image_path,
            )

            total = comments_df.shape[0]
            min_items_by_cluster = clustering.determine_min_items_by_cluster(total)

            (
                cluster_assignments,
                cluster_counts,
                calinski_harabasz_scores,
                silhouette_scores,
                most_similar_comments,
                umap_df,
            ) = clustering.perform_clustering(
                umap_df,
                min_eps,
                max_eps,
                n=10,
                threshold_values=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                embeddings_col="embeddings",
            )

            labels, source, target, values, comments = clustering.build_sankey_data(
                cluster_assignments,
                cluster_counts,
                most_similar_comments,
                min_items_by_cluster=min_items_by_cluster,
            )

            sankey_graph = clustering.plot_sankey(
                labels, source, target, values, comments, height=1000, width=1200
            )
            sankey_graph = convert_graph_to_html(sankey_graph)

            # Only the first element of the returned pair is rendered.
            scores_graph, _ = clustering.plot_clustering_metric(
                silhouette_scores, calinski_harabasz_scores
            )
            scores_graph = convert_graph_to_html(scores_graph)

    return render_template(
        "index.html",
        video_details=video_details,
        k_distance_graph=k_distance_graph,
        sankey_graph=sankey_graph,
        scores_graph=scores_graph,
        wordcloud_path=image_path,
        sentiment_daily_graph=sentiment_daily_graph,
        sentiment_count=sentiment_count,
    )


# Launch under gunicorn (port 5000):
#   gunicorn -b 0.0.0.0:5000 app_clustering.app:app
# URLs noted for that setup:
#   http://172.20.0.2:5000/
#   http://0.0.0.0:5000/
# Running this file directly starts the Flask server on port 7860 instead.
if __name__ == "__main__":
    app.run(port=7860, host="0.0.0.0")