Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copy of abc.py
|
2 |
+
# Import required packages
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
import altair as alt
|
6 |
+
import streamlit as st
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
from scipy.stats import zscore
|
9 |
+
import streamlit as st
|
10 |
+
import altair as alt
|
11 |
+
from sklearn.cluster import KMeans
|
12 |
+
from sklearn.preprocessing import MinMaxScaler
|
13 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
14 |
+
|
15 |
+
# Load the loans dataset; the path is relative to the app's working directory.
df_new = pd.read_csv('Final_data.csv')

# Page title
st.title("Exploratory Data Analysis on Kiva Loans")
st.sidebar.header("Filters")

# Filter for Country
country = df_new['country'].unique()
selected_country = st.sidebar.selectbox("Select Country", country.tolist())
if selected_country:
    filtered_df = df_new[df_new['country'] == selected_country]
else:
    st.warning("Please select a country from the sidebar")
    st.stop()

# Filter for Gender (all genders are selected by default)
borrower_genders = df_new['borrower_genders'].unique()
selected_genders = st.sidebar.multiselect(
    "Select Gender",
    borrower_genders.tolist(),
    default=borrower_genders.tolist(),
)
filtered_df = filtered_df[filtered_df['borrower_genders'].isin(selected_genders)]

# Filter for Loan Amount (slider bounds come from the whole dataset, not the selection)
min_loan, max_loan = float(df_new['loan_amount'].min()), float(df_new['loan_amount'].max())
selected_loan_amount = st.sidebar.slider(
    "Select Loan Amount",
    min_value=min_loan,
    max_value=max_loan,
    value=(min_loan, max_loan),
)
# between() is inclusive on both ends, matching the previous >= / <= pair.
# .copy() detaches the selection from df_new so the column assignment below
# writes to an independent frame instead of a slice (no SettingWithCopyWarning).
filtered_df = filtered_df[filtered_df['loan_amount'].between(*selected_loan_amount)].copy()

# Filter for Years
filtered_df['year'] = pd.to_datetime(filtered_df['date']).dt.year
years = sorted(filtered_df['year'].unique())
selected_years = st.sidebar.multiselect("Select Year(s)", years, default=years)
# Copy again: later sections add more columns to filtered_df.
filtered_df = filtered_df[filtered_df['year'].isin(selected_years)].copy()

# Summary of the selected filters.
# map(str, ...) keeps the join from failing on non-string gender values (e.g. NaN).
st.caption(f"Data for Country: {selected_country} | Gender: {', '.join(map(str, selected_genders))} | Loan Amount: {selected_loan_amount} | Years: {', '.join(map(str, selected_years))}")
|
47 |
+
|
48 |
+
|
49 |
+
# Distribution of Loan Sector: horizontal bars, one per sector, longest first.
st.subheader('Distribution of Loan Sector')
sector_encoding = dict(
    x=alt.X('count(sector):Q', title='Count'),
    y=alt.Y('sector:N', sort='-x', title='Sector'),
    color=alt.Color('sector:N', legend=None),
)
sector_chart = (
    alt.Chart(filtered_df)
    .mark_bar()
    .encode(**sector_encoding)
    .properties(width=600, height=400)
)
st.altair_chart(sector_chart)

# Distribution of Loan Term: histogram of term_in_months with at most 30 bins.
st.subheader('Distribution of Loan Term (in Months)')
term_encoding = dict(
    x=alt.X('term_in_months:Q', bin=alt.Bin(maxbins=30), title='Term in Months'),
    y=alt.Y('count():Q', title='Frequency'),
    color=alt.Color('term_in_months:Q', legend=None),
)
term_hist = (
    alt.Chart(filtered_df)
    .mark_bar()
    .encode(**term_encoding)
    .properties(width=600, height=400)
)
st.altair_chart(term_hist)
|
72 |
+
|
73 |
+
# Monthly Loan Amounts Over Time: one line per year, months on the x axis.
st.subheader('Monthly Loan Amounts Over Time')

# Parse the date column once and derive every calendar field from it.
# assign() returns a new frame, so nothing is written into a slice of df_new.
loan_dates = pd.to_datetime(filtered_df['date'])
filtered_df = filtered_df.assign(
    month=loan_dates.dt.month,
    month_name=loan_dates.dt.strftime('%b'),
    year=loan_dates.dt.year,
)

# Total loan amount per (year, month); 'month' is kept alongside its abbreviation.
monthly_loan_amount = filtered_df.groupby(['year', 'month_name', 'month'])['loan_amount'].sum().reset_index()

# Explicit calendar order, otherwise the nominal axis would sort alphabetically.
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

loan_time_series = alt.Chart(monthly_loan_amount).mark_line(point=True).encode(
    x=alt.X('month_name:N', sort=month_order, title='Month'),
    y=alt.Y('loan_amount:Q', title='Total Loan Amount'),
    color=alt.Color('year:N', title='Year'),
    tooltip=['year', 'month_name', 'loan_amount'],
).properties(
    width=700,
    height=400,
)

st.altair_chart(loan_time_series)
|
92 |
+
|
93 |
+
# Top 10 Countries with Highest Average Loan Amount.
# Computed over the whole dataset (df_new), independent of the sidebar filters.
st.subheader('Top 10 Countries with Highest Average Loan Amount')

# Normalise country names and drop missing/blank ones before aggregating.
country_names = df_new['country'].str.strip()
has_country = country_names.notna() & (country_names != '')
# assign() builds a new frame, so the stripped names are not written into a slice.
df_clean = df_new[has_country].assign(country=country_names[has_country])

# Aggregate the cleaned frame; previously the uncleaned df_new was grouped,
# which left df_clean unused.
top_10_countries_avg_loan = df_clean.groupby('country')['loan_amount'].mean().nlargest(10).reset_index()

top_10_chart = alt.Chart(top_10_countries_avg_loan).mark_bar().encode(
    x=alt.X('loan_amount:Q', title='Average Loan Amount'),
    y=alt.Y('country:N', sort='-x', title='Country'),
    color=alt.Color('country:N', legend=None),
).properties(
    width=600,
    height=400,
)
st.altair_chart(top_10_chart)
|
108 |
+
|
109 |
+
# Distribution of Genders: doughnut chart of borrower_genders in the filtered data.
st.subheader('Distribution of Borrower Genders')

# Two-column frame: the gender value and how many filtered loans carry it.
gender_counts = (
    filtered_df['borrower_genders']
    .value_counts()
    .rename_axis('borrower_genders')
    .reset_index(name='count')
)

slice_angle = alt.Theta(field="count", type="quantitative")
slice_color = alt.Color(field="borrower_genders", type="nominal", title="Borrower Genders")
slice_tooltip = [
    alt.Tooltip('borrower_genders:N', title="Gender"),
    alt.Tooltip('count:Q', title="Count"),
]

gender_doughnut_chart = (
    alt.Chart(gender_counts)
    .mark_arc(innerRadius=80, outerRadius=120)
    .encode(theta=slice_angle, color=slice_color, tooltip=slice_tooltip)
    .properties(width=400, height=400)
)

# Count labels drawn outside the ring (radius 150 vs. outer radius 120).
count_label = alt.Text('count:Q', format='.0f')
gender_doughnut_text = gender_doughnut_chart.mark_text(radius=150, size=15).encode(text=count_label)

# Layer the labels over the arcs, then style the shared legend.
final_chart = (gender_doughnut_chart + gender_doughnut_text).configure_legend(
    labelFontSize=12,
    titleFontSize=14,
)
st.altair_chart(final_chart)
|
133 |
+
|
134 |
+
|
135 |
+
# Dataset Summary: headline statistics for loan_amount plus describe() of the selection.
st.header('Dataset Summary')
loan_amounts = filtered_df['loan_amount']
st.caption('Mean Loan Amount: ' + str(round(loan_amounts.mean(), 2)))
st.caption('Median Loan Amount: ' + str(round(loan_amounts.median(), 2)))

# mode() returns an empty Series when the filters leave no rows; indexing it
# with [0] would raise, so fall back to a placeholder in that case.
loan_modes = loan_amounts.mode()
mode_text = str(loan_modes.iloc[0]) if not loan_modes.empty else 'N/A'
st.caption('Mode Loan Amount: ' + mode_text)
st.write(filtered_df.describe())

# Filtered dataframe
st.header("Filtered Data")
st.dataframe(filtered_df)
|
145 |
+
|
146 |
+
|
147 |
+
st.header('K-Means Clustering')

# The two numeric columns that are clustered.
filtered_df_reduced = filtered_df[['loan_amount', 'term_in_months']]

# Scaling and KMeans both fail on zero rows; stop with a message rather than a traceback.
if filtered_df_reduced.empty:
    st.warning("No loans match the selected filters, so clustering cannot be run.")
    st.stop()

# Histograms of the raw features, used to decide which scaler to apply.
fig, ax = plt.subplots(figsize=(10, 5))
filtered_df_reduced.hist(bins=100, ax=ax)
st.pyplot(fig)
# Streamlit reruns the script on every interaction; close the figure so
# matplotlib does not keep accumulating open figures.
plt.close(fig)

# Chosen scaler: MinMaxScaler with its default settings.
scaler = MinMaxScaler()

data_to_cluster_scaled = scaler.fit_transform(filtered_df_reduced)

Sum_of_squared_distances = []

# KMeans cannot fit more clusters than samples, so cap k at the row count (max 9).
K = range(1, min(10, data_to_cluster_scaled.shape[0] + 1))

for k in K:
    km = KMeans(n_clusters=k, n_init="auto")
    km.fit(data_to_cluster_scaled)
    Sum_of_squared_distances.append(km.inertia_)

# Elbow plot: inertia against the number of clusters.
fig, ax = plt.subplots()
ax.plot(K, Sum_of_squared_distances, 'bx-')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Sum of Squared Distances')
ax.set_title('Elbow Method For Optimal k')
ax.grid(True)

st.pyplot(fig)
plt.close(fig)
|
179 |
+
|
180 |
+
|
181 |
+
|
182 |
+
def k_means_simple(data, k, max_iters=100):
    """Cluster the rows of *data* into *k* groups with plain Lloyd iterations.

    Initial centroids are *k* distinct rows drawn with ``np.random.choice``,
    so results vary between calls unless NumPy's global seed is fixed.

    Args:
        data: 2-D NumPy array of shape (n_samples, n_features).
        k: Number of clusters; must not exceed ``n_samples``.
        max_iters: Upper bound on assignment/update rounds.

    Returns:
        A ``(labels, centroids)`` tuple: ``labels`` holds the cluster index of
        each row, ``centroids`` has shape (k, n_features).

    Raises:
        ValueError: From ``np.random.choice`` when ``k`` exceeds ``n_samples``.
    """
    centroids = data[np.random.choice(data.shape[0], k, replace=False)]
    # Defined up front so the return value is valid even when max_iters == 0.
    labels = np.zeros(data.shape[0], dtype=int)

    for _ in range(max_iters):
        # distances[c, i] is the Euclidean distance from centroid c to row i.
        distances = np.linalg.norm(data - centroids[:, np.newaxis], axis=2)
        labels = np.argmin(distances, axis=0)

        # Start from the current centroids (as floats) and overwrite only the
        # clusters that received points. The mean of an empty cluster is NaN,
        # which would poison every later distance computation.
        new_centroids = centroids.astype(float)
        for cluster_index in range(k):
            members = data[labels == cluster_index]
            if len(members):
                new_centroids[cluster_index] = members.mean(axis=0)

        # Converged: the update step left every centroid unchanged.
        if np.array_equal(centroids, new_centroids):
            break

        centroids = new_centroids

    return labels, centroids
|
197 |
+
|
198 |
+
# Cluster the scaled loans into up to 5 groups; k_means_simple draws k distinct
# rows as starting centroids, so k is capped at the number of available rows.
n_clusters = min(5, data_to_cluster_scaled.shape[0])
labels, final_centroids = k_means_simple(data_to_cluster_scaled, n_clusters)

# Assign each loan to its nearest final centroid.
# The distance matrix has one row per loan and one column per centroid.
distances = np.linalg.norm(data_to_cluster_scaled[:, np.newaxis] - final_centroids, axis=2)
nearest_centroid_indices = np.argmin(distances, axis=1)

# One row per loan: scaled coordinates plus the index of its cluster.
data_df = pd.DataFrame({
    'x': data_to_cluster_scaled[:, 0],
    'y': data_to_cluster_scaled[:, 1],
    'centroid': nearest_centroid_indices,
})

# One row per centroid, in the same scaled coordinate space.
centroids_df = pd.DataFrame({
    'x': final_centroids[:, 0],
    'y': final_centroids[:, 1],
    'centroid': range(final_centroids.shape[0]),
})

# 'type' lets the chart draw centroids fully opaque and loans semi-transparent.
data_df['type'] = 'data'
centroids_df['type'] = 'centroid'

# Original (unscaled) values for the tooltips. .values assigns by position,
# relying on data_to_cluster_scaled having the same row order as filtered_df.
for tooltip_column in ('loan_amount', 'term_in_months', 'activity', 'sector', 'region'):
    data_df[tooltip_column] = filtered_df[tooltip_column].values

combined_df = pd.concat([data_df, centroids_df])

scatter_plot = alt.Chart(combined_df).mark_circle(size=60).encode(
    x='x',
    y='y',
    color=alt.Color('centroid:N', scale=alt.Scale(scheme='category10')),
    opacity=alt.condition(
        alt.datum.type == 'data',
        alt.value(0.6),
        alt.value(1),
    ),
    tooltip=[
        alt.Tooltip('loan_amount:Q', title='Loan Amount'),
        alt.Tooltip('term_in_months:Q', title='Term (Months)'),
        alt.Tooltip('activity:N', title='Activity'),
        alt.Tooltip('sector:N', title='Sector'),
        alt.Tooltip('region:N', title='Region'),
    ],
).properties(
    # The plotted centroids are the ones returned after the iterations, not the initial draw.
    title='Reduced Data and Final Centroids'
)

st.altair_chart(scatter_plot, use_container_width=True)
|
250 |
+
|
251 |
+
# Make sure each selection key exists in session state, defaulting to None.
for state_key in ('country_selected', 'gender_selected', 'sector_selected'):
    if state_key not in st.session_state:
        st.session_state[state_key] = None
|
258 |
+
|
259 |
+
# Recommendation Engine based on Country, Gender, and Sector.
# Works on the full dataset (df_new) and is independent of the sidebar filters.
st.subheader("Loan Recommendation")

# Input for country. Missing values are dropped first: sorting a mix of
# strings and NaN raises TypeError.
country_options = sorted(df_new['country'].dropna().unique())
country_input = st.selectbox("Select Country", ["None"] + country_options)

# Gender options are narrowed to the chosen country when one is selected.
if country_input != "None":
    gender_source = df_new[df_new['country'] == country_input]
else:
    gender_source = df_new
gender_input = st.selectbox("Select Gender", ["None"] + list(gender_source['borrower_genders'].unique()))

# Sector options are narrowed only once both country and gender are chosen.
if country_input != "None" and gender_input != "None":
    sector_source = df_new[(df_new['country'] == country_input) & (df_new['borrower_genders'] == gender_input)]
else:
    sector_source = df_new
sector_input = st.selectbox("Select Sector", ["None"] + list(sector_source['sector'].unique()))

# Generate recommendations based on country, gender, and sector
if country_input != "None" and gender_input != "None" and sector_input != "None":
    feature_columns = ['loan_amount', 'term_in_months']

    # Loans matching all three choices; rows without usable features are
    # dropped because cosine_similarity rejects NaN input.
    user_filtered_df = df_new[
        (df_new['country'] == country_input) &
        (df_new['borrower_genders'] == gender_input) &
        (df_new['sector'] == sector_input)
    ].dropna(subset=feature_columns).reset_index(drop=True)

    if not user_filtered_df.empty:
        # Scale the features of these loans themselves. The previous code
        # sliced data_to_cluster_scaled, whose rows belong to the sidebar
        # selection and do not correspond to user_filtered_df.
        user_features_scaled = MinMaxScaler().fit_transform(user_filtered_df[feature_columns])

        # Only similarity to the reference loan (row 0) is needed, so compute
        # one row instead of the full n x n matrix.
        similarity_to_reference = cosine_similarity(user_features_scaled[:1], user_features_scaled)[0]

        # Rank by descending similarity and drop the reference loan by index:
        # with ties it is not guaranteed to sort first.
        ranked_indices = np.argsort(-similarity_to_reference, kind='stable')
        similar_loans_indices = ranked_indices[ranked_indices != 0][:3]

        # Display recommended loans (top 3)
        recommendations = user_filtered_df.iloc[similar_loans_indices][['country', 'borrower_genders', 'sector', 'loan_amount', 'term_in_months']]
        st.write("Recommended Loans:")
        st.dataframe(recommendations)
    else:
        st.write("No matching loans found for the selected country, gender, and sector.")
else:
    st.write("Please select a country, gender, and sector.")
|