Spaces:

jsulz
/

sotu-analysis

Running

App Files Files Community

jsulz commited on Aug 29, 2024

Commit

256f7c8

1 Parent(s): f239c56

switching to plotly graphs

Browse files

Files changed (2) hide show

.gitignore +2 -0
app.py +106 -83

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # ignore pycache
2	+ __pycache__/

app.py CHANGED Viewed

@@ -4,6 +4,8 @@ from nltk.util import ngrams
 from collections import Counter
 import pandas as pd
 import plotly.express as px
 import matplotlib.pyplot as plt
 # Load the dataset and convert it to a Pandas dataframe
@@ -28,7 +30,7 @@ df["ari"] = df["no-contractions"].apply(
     + (0.5 * (len(x.split()) / len(x.split("."))))
     - 21.43
 )
 written = df[df["categories"] == "Written"]
 spoken = df[df["categories"] == "Spoken"]
@@ -39,115 +41,136 @@ with gr.Blocks() as demo:
         # A Dashboard to Analyze the State of the Union Addresses
         """
     )
-    gr.BarPlot(
         df,
         x="date",
         y="word_count",
-        title="Total Number of Words in the Speeches",
-        color="categories",
     )
     # group by president and category and calculate the average word count sort by date
     avg_word_count = (
-        df.groupby(["date", "potus", "categories"])["word_count"].mean().reset_index()
     )
-    # create a bar chart
-    gr.BarPlot(
         avg_word_count,
         x="potus",
         y="word_count",
-        title="Average Number of Words in the Speeches",
         color="categories",
-        x_label_angle=-45,
-        height=400,
-        min_width=160,
-        fill_height=True,
-        container=True,
-        scale=2,
     )
     with gr.Row():
         ari = df[["potus", "date", "ari", "categories"]]
-        gr.LinePlot(
             ari,
             x="date",
             y="ari",
-            title="Automated Readability Index",
         )
     # get all unique president names
     presidents = df["potus"].unique()
     # convert presidents to a list
     presidents = presidents.tolist()
     # create a dropdown to select a president
-    president = gr.Dropdown(label="Select a President", choices=["All"] + presidents)
     grams = gr.Slider(minimum=1, maximum=4, step=1, label="N-grams", interactive=True)
-    with gr.Row():
-        # if president is not of type string
-        @gr.render(inputs=president)
-        def show_text(potus):
-            if potus != "All" and potus is not None:
-                ari = df[df["potus"] == potus][
-                    ["date", "categories", "word_count", "ari"]
-                ]
-                gr.DataFrame(ari, height=200)
-        @gr.render(inputs=president)
-        def word_length_bar(potus):
-            # calculate the total number of words in the speech_html column and add it to a new column
-            # if the president is "All", show the word count for all presidents
-            # if the president is not "All", show the word count for the selected president
-            if potus != "All" and potus is not None:
-                gr.LinePlot(
-                    df[df["potus"] == potus],
-                    x="date",
-                    y="word_count",
-                    title="Total Number of Words in the Speeches",
-                )
-    with gr.Row():
-        with gr.Column():
-            @gr.render(inputs=[president, grams])
-            def ngram_bar(potus, n_grams):
-                if potus != "All" and potus is not None:
-                    if type(n_grams) is not int:
-                        n_grams = 1
-                    print(n_grams)
-                    # create a Counter object from the trigrams
-                    potus_df = df[df["potus"] == potus]
-                    # decode the tokens-nostop column from a byte array to a list of string
-                    trigrams = (
-                        potus_df["tokens-nostop"]
-                        .apply(lambda x: list(ngrams(x, n_grams)))
-                        .apply(Counter)
-                        .sum()
-                    )
-                    # get the most common trigrams
-                    common_trigrams = trigrams.most_common(20)
-                    # unzip the list of tuples and plot the trigrams and counts as a bar chart
-                    trigrams, counts = zip(*common_trigrams)
-                    # join the trigrams into a single string
-                    trigrams = [" ".join(trigram) for trigram in trigrams]
-                    # create a dataframe from the trigrams and counts
-                    trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
-                    # plot the trigrams and counts as a bar chart from matplotlib
-                    """
-                    fig, ax = plt.subplots(figsize=(12, 4))
-                    ax.barh(trigrams_df["trigrams"], trigrams_df["counts"])
-                    ax.set_title("Top 20 Trigrams")
-                    ax.set_ylabel("Count")
-                    ax.set_xlabel("Trigrams")
-                    plt.xticks(rotation=45)
-                    # make it tight layout
-                    plt.tight_layout()
-                    """
-                    fig = px.scatter(
-                        trigrams_df,
-                        x="counts",
-                        y="trigrams",
-                        title="Top 20 Trigrams",
-                        orientation="h",
-                    )
-                    print(fig)
-                    gr.Plot(value=fig, container=True, visible=True)
 demo.launch(share=True)

 from collections import Counter
 import pandas as pd
 import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
 import matplotlib.pyplot as plt
 # Load the dataset and convert it to a Pandas dataframe
     + (0.5 * (len(x.split()) / len(x.split("."))))
     - 21.43
 )
+df = df.sort_values(by="date")
 written = df[df["categories"] == "Written"]
 spoken = df[df["categories"] == "Spoken"]
         # A Dashboard to Analyze the State of the Union Addresses
         """
     )
+    fig1 = px.line(
         df,
         x="date",
         y="word_count",
+        title="Total Number of Words in Addresses",
+        line_shape="spline",
+    )
+    fig1.update_layout(
+        xaxis=dict(title="Date of Address"),
+        yaxis=dict(title="Word Count"),
     )
+    gr.Plot(fig1)
     # group by president and category and calculate the average word count sort by date
     avg_word_count = (
+        df.groupby(["potus", "categories"])["word_count"].mean().reset_index()
     )
+    fig2 = px.bar(
         avg_word_count,
         x="potus",
         y="word_count",
+        title="Average Number of Words in Addresses by President",
         color="categories",
+        barmode="group",
     )
+    fig2.update_layout(
+        xaxis=dict(
+            title="President",
+            tickangle=-45,  # Rotate labels 45 degrees counterclockwise
+        ),
+        yaxis=dict(
+            title="Average Word Count",
+            tickangle=0,  # Default label angle (horizontal)
+        ),
+        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
+    )
+    gr.Plot(fig2)
     with gr.Row():
         ari = df[["potus", "date", "ari", "categories"]]
+        fig3 = px.line(
             ari,
             x="date",
             y="ari",
+            title="Automated Readability Index in each Address",
+            line_shape="spline",
+        )
+        fig3.update_layout(
+            xaxis=dict(title="Date of Address"),
+            yaxis=dict(title="ARI Score"),
         )
+        gr.Plot(fig3)
     # get all unique president names
     presidents = df["potus"].unique()
     # convert presidents to a list
     presidents = presidents.tolist()
     # create a dropdown to select a president
+    president = gr.Dropdown(label="Select a President", choices=presidents)
     grams = gr.Slider(minimum=1, maximum=4, step=1, label="N-grams", interactive=True)
+    def plotly_bar(n_grams, potus):
+        if potus is not None:
+            # create a Counter object from the trigrams
+            potus_df = df[df["potus"] == potus]
+            # decode the tokens-nostop column from a byte array to a list of string
+            trigrams = (
+                potus_df["tokens-nostop"]
+                .apply(lambda x: list(ngrams(x, n_grams)))
+                .apply(Counter)
+                .sum()
+            )
+            # get the most common trigrams
+            common_trigrams = trigrams.most_common(10)
+            # unzip the list of tuples and plot the trigrams and counts as a bar chart
+            trigrams, counts = zip(*common_trigrams)
+            # join the trigrams into a single string
+            trigrams = [" ".join(trigram) for trigram in trigrams]
+            # create a dataframe from the trigrams and counts
+            trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
+            fig4 = px.bar(
+                trigrams_df,
+                x="counts",
+                y="trigrams",
+                title=f"Top {n_grams}-grams",
+                orientation="h",
+                height=400,
+            )
+            return fig4
+    if president != "All" and president is not None:
+        gr.Plot(plotly_bar, inputs=[grams, president])
+    def plotly_line(president):
+        if president != "All" and president is not None:
+            potus_df = df[df["potus"] == president]
+            fig5 = make_subplots(specs=[[{"secondary_y": True}]])
+            fig5.add_trace(
+                go.Scatter(
+                    x=potus_df["date"],
+                    y=potus_df["word_count"],
+                    name="Word Count",
+                ),
+                secondary_y=False,
+            )
+            fig5.add_trace(
+                go.Scatter(
+                    x=potus_df["date"],
+                    y=potus_df["ari"],
+                    name="ARI",
+                ),
+                secondary_y=True,
+            )
+            # Add figure title
+            fig5.update_layout(title_text="Double Y Axis Example")
+            # Set x-axis title
+            fig5.update_xaxes(title_text="xaxis title")
+            # Set y-axes titles
+            fig5.update_yaxes(
+                title_text="<b>primary</b> yaxis title", secondary_y=False
+            )
+            fig5.update_yaxes(
+                title_text="<b>secondary</b> yaxis title", secondary_y=True
+            )
+            return fig5
+    # calculate the total number of words in the speech_html column and add it to a new column
+    # if the president is "All", show the word count for all presidents
+    # if the president is not "All", show the word count for the selected president
+    if president != "All" and president is not None:
+        gr.Plot(plotly_line, inputs=[president])
 demo.launch(share=True)