Spaces:
Running
Running
switching to plotly graphs
Browse files- .gitignore +2 -0
- app.py +106 -83
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ignore pycache
|
| 2 |
+
__pycache__/
|
app.py
CHANGED
|
@@ -4,6 +4,8 @@ from nltk.util import ngrams
|
|
| 4 |
from collections import Counter
|
| 5 |
import pandas as pd
|
| 6 |
import plotly.express as px
|
|
|
|
|
|
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
|
| 9 |
# Load the dataset and convert it to a Pandas dataframe
|
|
@@ -28,7 +30,7 @@ df["ari"] = df["no-contractions"].apply(
|
|
| 28 |
+ (0.5 * (len(x.split()) / len(x.split("."))))
|
| 29 |
- 21.43
|
| 30 |
)
|
| 31 |
-
|
| 32 |
written = df[df["categories"] == "Written"]
|
| 33 |
spoken = df[df["categories"] == "Spoken"]
|
| 34 |
|
|
@@ -39,115 +41,136 @@ with gr.Blocks() as demo:
|
|
| 39 |
# A Dashboard to Analyze the State of the Union Addresses
|
| 40 |
"""
|
| 41 |
)
|
| 42 |
-
|
| 43 |
df,
|
| 44 |
x="date",
|
| 45 |
y="word_count",
|
| 46 |
-
title="Total Number of Words in
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
)
|
|
|
|
| 49 |
# group by president and category and calculate the average word count sort by date
|
| 50 |
avg_word_count = (
|
| 51 |
-
df.groupby(["
|
| 52 |
)
|
| 53 |
-
|
| 54 |
-
gr.BarPlot(
|
| 55 |
avg_word_count,
|
| 56 |
x="potus",
|
| 57 |
y="word_count",
|
| 58 |
-
title="Average Number of Words in
|
| 59 |
color="categories",
|
| 60 |
-
|
| 61 |
-
height=400,
|
| 62 |
-
min_width=160,
|
| 63 |
-
fill_height=True,
|
| 64 |
-
container=True,
|
| 65 |
-
scale=2,
|
| 66 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
with gr.Row():
|
| 68 |
ari = df[["potus", "date", "ari", "categories"]]
|
| 69 |
-
|
| 70 |
ari,
|
| 71 |
x="date",
|
| 72 |
y="ari",
|
| 73 |
-
title="Automated Readability Index",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
)
|
|
|
|
| 75 |
# get all unique president names
|
| 76 |
presidents = df["potus"].unique()
|
| 77 |
# convert presidents to a list
|
| 78 |
presidents = presidents.tolist()
|
| 79 |
# create a dropdown to select a president
|
| 80 |
-
president = gr.Dropdown(label="Select a President", choices=
|
| 81 |
grams = gr.Slider(minimum=1, maximum=4, step=1, label="N-grams", interactive=True)
|
| 82 |
-
with gr.Row():
|
| 83 |
-
# if president is not of type string
|
| 84 |
-
@gr.render(inputs=president)
|
| 85 |
-
def show_text(potus):
|
| 86 |
-
if potus != "All" and potus is not None:
|
| 87 |
-
ari = df[df["potus"] == potus][
|
| 88 |
-
["date", "categories", "word_count", "ari"]
|
| 89 |
-
]
|
| 90 |
-
gr.DataFrame(ari, height=200)
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
#
|
| 95 |
-
|
| 96 |
-
#
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
print(n_grams)
|
| 114 |
-
# create a Counter object from the trigrams
|
| 115 |
-
potus_df = df[df["potus"] == potus]
|
| 116 |
-
# decode the tokens-nostop column from a byte array to a list of string
|
| 117 |
-
trigrams = (
|
| 118 |
-
potus_df["tokens-nostop"]
|
| 119 |
-
.apply(lambda x: list(ngrams(x, n_grams)))
|
| 120 |
-
.apply(Counter)
|
| 121 |
-
.sum()
|
| 122 |
-
)
|
| 123 |
-
# get the most common trigrams
|
| 124 |
-
common_trigrams = trigrams.most_common(20)
|
| 125 |
-
# unzip the list of tuples and plot the trigrams and counts as a bar chart
|
| 126 |
-
trigrams, counts = zip(*common_trigrams)
|
| 127 |
-
# join the trigrams into a single string
|
| 128 |
-
trigrams = [" ".join(trigram) for trigram in trigrams]
|
| 129 |
-
# create a dataframe from the trigrams and counts
|
| 130 |
-
trigrams_df = pd.DataFrame({"trigrams": trigrams, "counts": counts})
|
| 131 |
-
# plot the trigrams and counts as a bar chart from matplotlib
|
| 132 |
-
"""
|
| 133 |
-
fig, ax = plt.subplots(figsize=(12, 4))
|
| 134 |
-
ax.barh(trigrams_df["trigrams"], trigrams_df["counts"])
|
| 135 |
-
ax.set_title("Top 20 Trigrams")
|
| 136 |
-
ax.set_ylabel("Count")
|
| 137 |
-
ax.set_xlabel("Trigrams")
|
| 138 |
-
plt.xticks(rotation=45)
|
| 139 |
-
# make it tight layout
|
| 140 |
-
plt.tight_layout()
|
| 141 |
-
"""
|
| 142 |
-
fig = px.scatter(
|
| 143 |
-
trigrams_df,
|
| 144 |
-
x="counts",
|
| 145 |
-
y="trigrams",
|
| 146 |
-
title="Top 20 Trigrams",
|
| 147 |
-
orientation="h",
|
| 148 |
-
)
|
| 149 |
-
print(fig)
|
| 150 |
-
gr.Plot(value=fig, container=True, visible=True)
|
| 151 |
|
| 152 |
|
| 153 |
demo.launch(share=True)
|
|
|
|
| 4 |
from collections import Counter
|
| 5 |
import pandas as pd
|
| 6 |
import plotly.express as px
|
| 7 |
+
import plotly.graph_objects as go
|
| 8 |
+
from plotly.subplots import make_subplots
|
| 9 |
import matplotlib.pyplot as plt
|
| 10 |
|
| 11 |
# Load the dataset and convert it to a Pandas dataframe
|
|
|
|
| 30 |
+ (0.5 * (len(x.split()) / len(x.split("."))))
|
| 31 |
- 21.43
|
| 32 |
)
|
| 33 |
+
df = df.sort_values(by="date")
|
| 34 |
written = df[df["categories"] == "Written"]
|
| 35 |
spoken = df[df["categories"] == "Spoken"]
|
| 36 |
|
|
|
|
| 41 |
# A Dashboard to Analyze the State of the Union Addresses
|
| 42 |
"""
|
| 43 |
)
|
| 44 |
+
fig1 = px.line(
|
| 45 |
df,
|
| 46 |
x="date",
|
| 47 |
y="word_count",
|
| 48 |
+
title="Total Number of Words in Addresses",
|
| 49 |
+
line_shape="spline",
|
| 50 |
+
)
|
| 51 |
+
fig1.update_layout(
|
| 52 |
+
xaxis=dict(title="Date of Address"),
|
| 53 |
+
yaxis=dict(title="Word Count"),
|
| 54 |
)
|
| 55 |
+
gr.Plot(fig1)
|
| 56 |
# group by president and category and calculate the average word count
|
| 57 |
avg_word_count = (
|
| 58 |
+
df.groupby(["potus", "categories"])["word_count"].mean().reset_index()
|
| 59 |
)
|
| 60 |
+
fig2 = px.bar(
|
|
|
|
| 61 |
avg_word_count,
|
| 62 |
x="potus",
|
| 63 |
y="word_count",
|
| 64 |
+
title="Average Number of Words in Addresses by President",
|
| 65 |
color="categories",
|
| 66 |
+
barmode="group",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
)
|
| 68 |
+
fig2.update_layout(
|
| 69 |
+
xaxis=dict(
|
| 70 |
+
title="President",
|
| 71 |
+
tickangle=-45, # Rotate labels 45 degrees counterclockwise
|
| 72 |
+
),
|
| 73 |
+
yaxis=dict(
|
| 74 |
+
title="Average Word Count",
|
| 75 |
+
tickangle=0, # Default label angle (horizontal)
|
| 76 |
+
),
|
| 77 |
+
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
|
| 78 |
+
)
|
| 79 |
+
gr.Plot(fig2)
|
| 80 |
with gr.Row():
|
| 81 |
ari = df[["potus", "date", "ari", "categories"]]
|
| 82 |
+
fig3 = px.line(
|
| 83 |
ari,
|
| 84 |
x="date",
|
| 85 |
y="ari",
|
| 86 |
+
title="Automated Readability Index in each Address",
|
| 87 |
+
line_shape="spline",
|
| 88 |
+
)
|
| 89 |
+
fig3.update_layout(
|
| 90 |
+
xaxis=dict(title="Date of Address"),
|
| 91 |
+
yaxis=dict(title="ARI Score"),
|
| 92 |
)
|
| 93 |
+
gr.Plot(fig3)
|
| 94 |
# get all unique president names
|
| 95 |
presidents = df["potus"].unique()
|
| 96 |
# convert presidents to a list
|
| 97 |
presidents = presidents.tolist()
|
| 98 |
# create a dropdown to select a president
|
| 99 |
+
president = gr.Dropdown(label="Select a President", choices=presidents)
|
| 100 |
grams = gr.Slider(minimum=1, maximum=4, step=1, label="N-grams", interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
+
def plotly_bar(n_grams, potus):
    """Return a horizontal bar chart of the ten most common n-grams for a president.

    Args:
        n_grams: Size of the n-gram window handed to ``ngrams``.
        potus: Selected president; matched against the ``potus`` column of ``df``.

    Returns:
        A Plotly bar figure, or ``None`` when no single president is selected
        or the selection produces no n-grams at all.
    """
    # Mirror the guard used by plotly_line: nothing to draw without a concrete
    # president.
    if potus is None or potus == "All":
        return None

    potus_df = df[df["potus"] == potus]

    # Tally n-grams across every address in one pass. Updating a single
    # Counter avoids re-copying the running total for each address and still
    # works when the selection is empty.
    # NOTE(review): presumably each "tokens-nostop" cell is a sequence of
    # string tokens -- confirm against the dataset loader.
    totals = Counter()
    for tokens in potus_df["tokens-nostop"]:
        totals.update(ngrams(tokens, n_grams))

    common_ngrams = totals.most_common(10)
    if not common_ngrams:
        # No rows, or every address is shorter than n_grams tokens.
        return None

    # Join each n-gram tuple into one label for the category axis.
    labels = [" ".join(gram) for gram, _ in common_ngrams]
    counts = [count for _, count in common_ngrams]
    trigrams_df = pd.DataFrame({"trigrams": labels, "counts": counts})

    fig4 = px.bar(
        trigrams_df,
        x="counts",
        y="trigrams",
        title=f"Top {n_grams}-grams",
        orientation="h",
        height=400,
    )
    return fig4
|
| 130 |
|
| 131 |
+
if president != "All" and president is not None:
|
| 132 |
+
gr.Plot(plotly_bar, inputs=[grams, president])
|
| 133 |
+
|
| 134 |
+
def plotly_line(president):
    """Return a dual-axis line chart of word count and ARI for one president.

    Args:
        president: Selected president; matched against the ``potus`` column
            of ``df``.

    Returns:
        A Plotly figure with word count on the primary y-axis and ARI on the
        secondary y-axis, or ``None`` when no single president is selected.
    """
    if president is None or president == "All":
        return None

    potus_df = df[df["potus"] == president]

    # The two series are drawn against separate y-axes, so a secondary axis
    # is requested up front.
    fig5 = make_subplots(specs=[[{"secondary_y": True}]])
    fig5.add_trace(
        go.Scatter(
            x=potus_df["date"],
            y=potus_df["word_count"],
            name="Word Count",
        ),
        secondary_y=False,
    )
    fig5.add_trace(
        go.Scatter(
            x=potus_df["date"],
            y=potus_df["ari"],
            name="ARI",
        ),
        secondary_y=True,
    )

    # Descriptive labels, matching the wording used on the overview charts,
    # in place of the placeholder text left over from the Plotly example.
    fig5.update_layout(title_text=f"Word Count and ARI for {president}")
    fig5.update_xaxes(title_text="Date of Address")
    fig5.update_yaxes(title_text="Word Count", secondary_y=False)
    fig5.update_yaxes(title_text="ARI Score", secondary_y=True)
    return fig5
|
| 168 |
|
| 169 |
+
# calculate the total number of words in the speech_html column and add it to a new column
|
| 170 |
+
# if the president is "All", show the word count for all presidents
|
| 171 |
+
# if the president is not "All", show the word count for the selected president
|
| 172 |
+
if president != "All" and president is not None:
|
| 173 |
+
gr.Plot(plotly_line, inputs=[president])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
|
| 176 |
demo.launch(share=True)
|