sufianahmad513 committed on
Commit
1f70e3d
·
verified ·
1 Parent(s): 4e7d6cc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +305 -0
app.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copy of abc.py
2
+ # Import required packages
3
+ import pandas as pd
4
+ import numpy as np
5
+ import altair as alt
6
+ import streamlit as st
7
+ import matplotlib.pyplot as plt
8
+ from scipy.stats import zscore
9
+ import streamlit as st
10
+ import altair as alt
11
+ from sklearn.cluster import KMeans
12
+ from sklearn.preprocessing import MinMaxScaler
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+
15
# Load the loan dataset (expects Final_data.csv in the working directory).
df_new = pd.read_csv('Final_data.csv')

# Page title
st.title("Exploratory Data Analysis on Kiva Loans")
st.sidebar.header("Filters")

# Filter for Country: a single country drives every chart below.
country = df_new['country'].unique()
selected_country = st.sidebar.selectbox("Select Country", country.tolist())
if not selected_country:
    st.warning("Please select a country from the sidebar")
    st.stop()
filtered_df = df_new[df_new['country'] == selected_country]

# Filter for Gender: every gender value is selected by default.
borrower_genders = df_new['borrower_genders'].unique()
selected_genders = st.sidebar.multiselect(
    "Select Gender",
    borrower_genders.tolist(),
    default=borrower_genders.tolist(),
)
filtered_df = filtered_df[filtered_df['borrower_genders'].isin(selected_genders)]

# Filter for Loan Amount: slider bounds come from the whole dataset.
min_loan, max_loan = float(df_new['loan_amount'].min()), float(df_new['loan_amount'].max())
selected_loan_amount = st.sidebar.slider(
    "Select Loan Amount",
    min_value=min_loan,
    max_value=max_loan,
    value=(min_loan, max_loan),
)
lowest_amount, highest_amount = selected_loan_amount
# Series.between is inclusive on both ends, matching the original >= / <= pair.
filtered_df = filtered_df[filtered_df['loan_amount'].between(lowest_amount, highest_amount)]

# Filter for Years
# assign() returns a new frame, so the derived column is never written into a
# slice of df_new (which triggers pandas' SettingWithCopyWarning).
filtered_df = filtered_df.assign(year=pd.to_datetime(filtered_df['date']).dt.year)
years = sorted(filtered_df['year'].unique())
selected_years = st.sidebar.multiselect("Select Year(s)", years, default=years)
# Final copy so later sections can add columns to filtered_df safely.
filtered_df = filtered_df[filtered_df['year'].isin(selected_years)].copy()

# selected filters
# map(str, ...) keeps the join from failing on non-string gender values (e.g. NaN).
st.caption(f"Data for Country: {selected_country} | Gender: {', '.join(map(str, selected_genders))} | Loan Amount: {selected_loan_amount} | Years: {', '.join(map(str, selected_years))}")
47
+
48
+
49
# Horizontal bar chart: number of filtered loans per sector.
st.subheader('Distribution of Loan Sector')
sector_encoding = {
    'x': alt.X('count(sector):Q', title='Count'),
    # sort='-x' orders the sectors by descending bar length.
    'y': alt.Y('sector:N', sort='-x', title='Sector'),
    'color': alt.Color('sector:N', legend=None),
}
sector_bars = alt.Chart(filtered_df).mark_bar().encode(**sector_encoding)
sector_chart = sector_bars.properties(width=600, height=400)
st.altair_chart(sector_chart)
60
+
61
# Histogram of loan terms (months) for the filtered loans, at most 30 bins.
st.subheader('Distribution of Loan Term (in Months)')
term_axis = alt.X('term_in_months:Q', bin=alt.Bin(maxbins=30), title='Term in Months')
frequency_axis = alt.Y('count():Q', title='Frequency')
term_colour = alt.Color('term_in_months:Q', legend=None)
term_hist = (
    alt.Chart(filtered_df)
    .mark_bar()
    .encode(x=term_axis, y=frequency_axis, color=term_colour)
    .properties(width=600, height=400)
)
st.altair_chart(term_hist)
72
+
73
# Monthly Loan Amounts Over Time: one line per year, months along the x axis.
st.subheader('Monthly Loan Amounts Over Time')

# Parse the date column once and derive all calendar fields from it.
# assign() builds a new frame instead of writing into a filtered slice,
# which avoids pandas' SettingWithCopyWarning.
loan_dates = pd.to_datetime(filtered_df['date'])
filtered_df = filtered_df.assign(
    month=loan_dates.dt.month,
    month_name=loan_dates.dt.strftime('%b'),
    year=loan_dates.dt.year,
)

# Total loan amount per (year, month); 'month' is kept alongside the
# abbreviated name so the numeric month stays available in the result.
monthly_loan_amount = (
    filtered_df.groupby(['year', 'month_name', 'month'])['loan_amount']
    .sum()
    .reset_index()
)

loan_time_series = alt.Chart(monthly_loan_amount).mark_line(point=True).encode(
    # Explicit calendar order; the default nominal sort would be alphabetical.
    x=alt.X('month_name:N', sort=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], title='Month'),
    y=alt.Y('loan_amount:Q', title='Total Loan Amount'),
    color=alt.Color('year:N', title='Year'),
    tooltip=['year', 'month_name', 'loan_amount'],
).properties(
    width=700,
    height=400,
)

st.altair_chart(loan_time_series)
92
+
93
# Top 10 Countries with Highest Average Loan Amount.
# Uses the whole dataset (df_new), not the sidebar-filtered frame.
st.subheader('Top 10 Countries with Highest Average Loan Amount')

# Normalise country names first so rows differing only in surrounding
# whitespace fall into the same group, then drop missing/blank names.
# assign() returns a new frame, so df_new itself is left untouched.
df_clean = df_new.assign(country=df_new['country'].str.strip())
df_clean = df_clean[df_clean['country'].notna() & (df_clean['country'] != '')]

# Aggregate the cleaned frame (the original aggregated df_new and never
# used df_clean).
top_10_countries_avg_loan = (
    df_clean.groupby('country')['loan_amount']
    .mean()
    .nlargest(10)
    .reset_index()
)

top_10_chart = alt.Chart(top_10_countries_avg_loan).mark_bar().encode(
    x=alt.X('loan_amount:Q', title='Average Loan Amount'),
    y=alt.Y('country:N', sort='-x', title='Country'),
    color=alt.Color('country:N', legend=None),
).properties(
    width=600,
    height=400,
)
st.altair_chart(top_10_chart)
108
+
109
# Doughnut chart: how many filtered loans carry each borrower-gender value.
st.subheader('Distribution of Borrower Genders')

# Two-column frame: the gender value and the number of rows that have it.
gender_counts = (
    filtered_df['borrower_genders']
    .value_counts()
    .rename_axis('borrower_genders')
    .reset_index(name='count')
)

gender_tooltips = [
    alt.Tooltip('borrower_genders:N', title="Gender"),
    alt.Tooltip('count:Q', title="Count"),
]
gender_ring = alt.Chart(gender_counts).mark_arc(innerRadius=80, outerRadius=120)
gender_doughnut_chart = gender_ring.encode(
    theta=alt.Theta(field="count", type="quantitative"),
    color=alt.Color(field="borrower_genders", type="nominal", title="Borrower Genders"),
    tooltip=gender_tooltips,
).properties(width=400, height=400)

# Count labels drawn outside the ring (radius 150 > outerRadius 120).
count_label = alt.Text('count:Q', format='.0f')
gender_doughnut_text = gender_doughnut_chart.mark_text(radius=150, size=15).encode(text=count_label)

layered_gender_chart = alt.layer(gender_doughnut_chart, gender_doughnut_text)
final_chart = layered_gender_chart.configure_legend(labelFontSize=12, titleFontSize=14)
st.altair_chart(final_chart)
133
+
134
+
135
# Dataset Summary: headline statistics for the filtered loans.
st.header('Dataset Summary')
loan_amounts = filtered_df['loan_amount']
st.caption('Mean Loan Amount: ' + str(round(loan_amounts.mean(), 2)))
st.caption('Median Loan Amount: ' + str(round(loan_amounts.median(), 2)))

# mode() returns an empty Series when no rows survive the filters; indexing
# it with [0] would raise, so fall back to a placeholder instead.
loan_modes = loan_amounts.mode()
mode_text = str(loan_modes.iloc[0]) if not loan_modes.empty else 'N/A'
st.caption('Mode Loan Amount: ' + mode_text)
st.write(filtered_df.describe())

# Filtered dataframe
st.header("Filtered Data")
st.dataframe(filtered_df)
145
+
146
+
147
st.header('K-Means Clustering')

# The two numeric columns that are clustered.
filtered_df_reduced = filtered_df[['loan_amount', 'term_in_months']]

# Nothing below can run on an empty selection (the scaler and KMeans both
# need at least one sample), so stop with a message instead of a traceback.
if filtered_df_reduced.empty:
    st.warning("No loans match the current filters, so there is nothing to cluster.")
    st.stop()

# Histograms of both features, used to decide which scaler to apply.
# One Axes per column: handing a single Axes to a two-column DataFrame.hist
# makes pandas clear the figure and emit a warning.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
filtered_df_reduced.hist(bins=100, ax=axes)
st.pyplot(fig)
plt.close(fig)  # release the figure so open figures do not pile up

# Chosen scaler: rescales each feature to the [0, 1] range.
scaler = MinMaxScaler()
data_to_cluster_scaled = scaler.fit_transform(filtered_df_reduced)

# Elbow method: record the inertia (sum of squared distances) for each k.
Sum_of_squared_distances = []

# KMeans cannot fit more clusters than samples, so cap the sweep at the
# number of rows when the selection is small (the original always used 1..9).
K = range(1, min(10, len(data_to_cluster_scaled) + 1))

for k in K:
    # Fixed seed so the elbow curve is identical across script runs.
    km = KMeans(n_clusters=k, n_init="auto", random_state=0)
    km.fit(data_to_cluster_scaled)
    Sum_of_squared_distances.append(km.inertia_)

fig, ax = plt.subplots()
ax.plot(K, Sum_of_squared_distances, 'bx-')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Sum of Squared Distances')
ax.set_title('Elbow Method For Optimal k')
ax.grid(True)

st.pyplot(fig)
plt.close(fig)
180
+
181
+
182
def k_means_simple(data, k, max_iters=100):
    """Cluster the rows of ``data`` into ``k`` groups with Lloyd's algorithm.

    The starting centroids are ``k`` distinct rows picked with
    ``np.random.choice``, so the result depends on NumPy's global random
    state; seed it beforehand for reproducible output.

    Args:
        data: 2-D array-like of shape (n_samples, n_features).
        k: Number of clusters. Must not exceed ``n_samples``.
        max_iters: Upper bound on the number of centroid updates.

    Returns:
        A ``(labels, centroids)`` tuple. ``labels[i]`` is the index of the
        returned centroid nearest to ``data[i]``; ``centroids`` has shape
        (k, n_features).

    Raises:
        ValueError: Raised by ``np.random.choice`` when ``k`` is larger than
            the number of rows.
    """
    data = np.asarray(data, dtype=float)
    centroids = data[np.random.choice(data.shape[0], k, replace=False)]

    def assign(current_centroids):
        # (k, n_samples) matrix of point-to-centroid distances; the arg-min
        # over the centroid axis gives each point's cluster index.
        distances = np.linalg.norm(data - current_centroids[:, np.newaxis], axis=2)
        return np.argmin(distances, axis=0)

    for _ in range(max_iters):
        labels = assign(centroids)

        # Start from the current centroids so a cluster that received no
        # points keeps its position; taking the mean of an empty selection
        # would yield NaN, which poisons the distances and blocks convergence.
        new_centroids = centroids.copy()
        for i in range(k):
            members = data[labels == i]
            if len(members):
                new_centroids[i] = members.mean(axis=0)

        if np.array_equal(centroids, new_centroids):
            break

        centroids = new_centroids

    # Assign against the centroids actually returned. This matches the loop's
    # labels on convergence, stays consistent when max_iters runs out, and
    # defines labels even for max_iters=0.
    labels = assign(centroids)
    return labels, centroids
197
+
198
# Cluster the scaled loans. k_means_simple draws k distinct rows as starting
# centroids, so k is capped at the row count to keep small selections working
# (the original hard-coded 5 and failed with fewer than five loans).
n_clusters = min(5, len(data_to_cluster_scaled))
labels, final_centroids = k_means_simple(data_to_cluster_scaled, n_clusters)

# Nearest final centroid for every point: (n_samples, k) distance matrix.
distances = np.linalg.norm(data_to_cluster_scaled[:, np.newaxis] - final_centroids, axis=2)
nearest_centroid_indices = np.argmin(distances, axis=1)

# One row per loan, in scaled coordinates, tagged with its cluster.
data_df = pd.DataFrame({
    'x': data_to_cluster_scaled[:, 0],
    'y': data_to_cluster_scaled[:, 1],
    'centroid': nearest_centroid_indices,
})

# One row per centroid, in the same coordinate space.
centroids_df = pd.DataFrame({
    'x': final_centroids[:, 0],
    'y': final_centroids[:, 1],
    'centroid': range(final_centroids.shape[0]),
})

# 'type' lets the chart tell loans and centroids apart.
data_df['type'] = 'data'
centroids_df['type'] = 'centroid'

# Original (unscaled) attributes for the tooltips. .values copies by position:
# data_to_cluster_scaled was built from filtered_df in the same row order.
for tooltip_column in ('loan_amount', 'term_in_months', 'activity', 'sector', 'region'):
    data_df[tooltip_column] = filtered_df[tooltip_column].values

combined_df = pd.concat([data_df, centroids_df])

scatter_plot = alt.Chart(combined_df).mark_circle(size=60).encode(
    x='x',
    y='y',
    color=alt.Color('centroid:N', scale=alt.Scale(scheme='category10')),
    # Loans are semi-transparent; centroids are drawn fully opaque.
    opacity=alt.condition(
        alt.datum.type == 'data',
        alt.value(0.6),
        alt.value(1),
    ),
    tooltip=[
        alt.Tooltip('loan_amount:Q', title='Loan Amount'),
        alt.Tooltip('term_in_months:Q', title='Term (Months)'),
        alt.Tooltip('activity:N', title='Activity'),
        alt.Tooltip('sector:N', title='Sector'),
        alt.Tooltip('region:N', title='Region'),
    ],
).properties(
    # The plotted centroids are the ones returned after k-means has run,
    # not the random starting rows, so the title says "Final".
    title='Reduced Data and Final Centroids'
)

st.altair_chart(scatter_plot, use_container_width=True)
250
+
251
# Fix session states: make sure the keys exist.
for state_key in ('country_selected', 'gender_selected', 'sector_selected'):
    if state_key not in st.session_state:
        st.session_state[state_key] = None

# Recommendation Engine based on Country, Gender, and Sector.
# Works on the full dataset (df_new), independently of the sidebar filters.
st.subheader("Loan Recommendation")

# Input for country. dropna() keeps sorted() from failing on a mix of
# strings and NaN.
country_options = sorted(df_new['country'].dropna().unique())
country_input = st.selectbox("Select Country", ["None"] + country_options)

# Gender options are narrowed to the chosen country once one is selected.
if country_input != "None":
    gender_pool = df_new[df_new['country'] == country_input]
else:
    gender_pool = df_new
gender_options = list(gender_pool['borrower_genders'].dropna().unique())
gender_input = st.selectbox("Select Gender", ["None"] + gender_options)

# Sector options are narrowed once both country and gender are selected.
if country_input != "None" and gender_input != "None":
    sector_pool = df_new[
        (df_new['country'] == country_input)
        & (df_new['borrower_genders'] == gender_input)
    ]
else:
    sector_pool = df_new
sector_options = list(sector_pool['sector'].dropna().unique())
sector_input = st.selectbox("Select Sector", ["None"] + sector_options)

# Generate recommendations based on country, gender, and sector
if country_input != "None" and gender_input != "None" and sector_input != "None":
    feature_columns = ['loan_amount', 'term_in_months']

    # Loans matching all three selections; rows without both numeric
    # features are dropped because cosine_similarity rejects NaN.
    user_filtered_df = df_new[
        (df_new['country'] == country_input)
        & (df_new['borrower_genders'] == gender_input)
        & (df_new['sector'] == sector_input)
    ].dropna(subset=feature_columns).reset_index(drop=True)

    if not user_filtered_df.empty:
        # Scale the features of these loans themselves. The original sliced
        # the first N rows of the sidebar-filtered scaled array, which belong
        # to different loans than the rows displayed below.
        scaled_features = MinMaxScaler().fit_transform(user_filtered_df[feature_columns])

        # Similarity of the first matching loan (the reference) to every
        # matching loan; only this one row of the matrix is needed.
        similarity_to_first = cosine_similarity(scaled_features[:1], scaled_features)[0]

        # Rank by descending similarity and drop the reference loan by index,
        # so ties cannot leave it among its own recommendations. Keep 3.
        ranked = np.argsort(-similarity_to_first, kind='stable')
        similar_loans_indices = ranked[ranked != 0][:3]

        # Display recommended loans
        recommendations = user_filtered_df.iloc[similar_loans_indices][
            ['country', 'borrower_genders', 'sector', 'loan_amount', 'term_in_months']
        ]
        st.write("Recommended Loans:")
        st.dataframe(recommendations)
    else:
        st.write("No matching loans found for the selected country, gender, and sector.")
else:
    st.write("Please select a country, gender, and sector.")