File size: 11,173 Bytes
1f70e3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# Import required packages
import pandas as pd
import numpy as np
import altair as alt
import streamlit as st
import matplotlib.pyplot as plt
from scipy.stats import zscore
import streamlit as st
import altair as alt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

df_new= pd.read_csv('Final_data.csv')
# Page title
st.title("Exploratory Data Analysis on Kiva Loans")
st.sidebar.header("Filters")

# Filter for Country
country = df_new['country'].unique()
selected_country = st.sidebar.selectbox("Select Country", country.tolist())
if selected_country:
    filtered_df = df_new[df_new['country'] == selected_country]
else:
    st.warning("Please select a country from the sidebar")
    st.stop()

# Filter for Gender 
borrower_genders = df_new['borrower_genders'].unique()
selected_genders = st.sidebar.multiselect("Select Gender", borrower_genders.tolist(), default=borrower_genders.tolist())
filtered_df = filtered_df[filtered_df['borrower_genders'].isin(selected_genders)]

# Filter for Loan Amount
min_loan, max_loan = float(df_new['loan_amount'].min()), float(df_new['loan_amount'].max())
selected_loan_amount = st.sidebar.slider("Select Loan Amount", min_value=min_loan, max_value=max_loan, value=(min_loan, max_loan))
filtered_df = filtered_df[(filtered_df['loan_amount'] >= selected_loan_amount[0]) & (filtered_df['loan_amount'] <= selected_loan_amount[1])]

# Filter for Years 
filtered_df['year'] = pd.to_datetime(filtered_df['date']).dt.year
years = sorted(filtered_df['year'].unique())
selected_years = st.sidebar.multiselect("Select Year(s)", years, default=years)
filtered_df = filtered_df[filtered_df['year'].isin(selected_years)]

# selected filters
st.caption(f"Data for Country: {selected_country} | Gender: {', '.join(selected_genders)} | Loan Amount: {selected_loan_amount} | Years: {', '.join(map(str, selected_years))}")


# Distribution of Loan Sector
st.subheader('Distribution of Loan Sector')
sector_chart = alt.Chart(filtered_df).mark_bar().encode(
    x=alt.X('count(sector):Q', title='Count'),
    y=alt.Y('sector:N', sort='-x', title='Sector'),
    color=alt.Color('sector:N', legend=None)
).properties(
    width=600,
    height=400   
)
st.altair_chart(sector_chart)

# Distribution of Loan Term 
st.subheader('Distribution of Loan Term (in Months)')
term_hist = alt.Chart(filtered_df).mark_bar().encode(
    x=alt.X('term_in_months:Q', bin=alt.Bin(maxbins=30), title='Term in Months'),
    y=alt.Y('count():Q', title='Frequency'),
    color=alt.Color('term_in_months:Q', legend=None)
).properties(
    width=600,
    height=400
)
st.altair_chart(term_hist)

# Monthly Loan Amounts Over Time
st.subheader('Monthly Loan Amounts Over Time')
filtered_df['month'] = pd.to_datetime(filtered_df['date']).dt.month
filtered_df['month_name'] = pd.to_datetime(filtered_df['date']).dt.strftime('%b')
filtered_df['year'] = pd.to_datetime(filtered_df['date']).dt.year

monthly_loan_amount = filtered_df.groupby(['year', 'month_name', 'month'])['loan_amount'].sum().reset_index()

loan_time_series = alt.Chart(monthly_loan_amount).mark_line(point=True).encode(
    x=alt.X('month_name:N', sort=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], title='Month'),
    y=alt.Y('loan_amount:Q', title='Total Loan Amount'),
    color=alt.Color('year:N', title='Year'),
    tooltip=['year', 'month_name', 'loan_amount']
).properties(
    width=700,
    height=400
)

st.altair_chart(loan_time_series)

# Top 10 Countries with Highest Average Loan Amount
st.subheader('Top 10 Countries with Highest Average Loan Amount')
df_clean = df_new[df_new['country'].notna() & (df_new['country'].str.strip() != '')]
df_clean['country'] = df_clean['country'].str.strip()
top_10_countries_avg_loan = df_new.groupby('country')['loan_amount'].mean().nlargest(10).reset_index()

top_10_chart = alt.Chart(top_10_countries_avg_loan).mark_bar().encode(
    x=alt.X('loan_amount:Q', title='Average Loan Amount'),
    y=alt.Y('country:N', sort='-x', title='Country'),
    color=alt.Color('country:N', legend=None)
).properties(
    width=600,
    height=400
)
st.altair_chart(top_10_chart)

# Distribution of Genders 
st.subheader('Distribution of Borrower Genders')
gender_counts = filtered_df['borrower_genders'].value_counts().reset_index()
gender_counts.columns = ['borrower_genders', 'count']

gender_doughnut_chart = alt.Chart(gender_counts).mark_arc(innerRadius=80, outerRadius=120).encode(
    theta=alt.Theta(field="count", type="quantitative"),
    color=alt.Color(field="borrower_genders", type="nominal", title="Borrower Genders"),
    tooltip=[alt.Tooltip('borrower_genders:N', title="Gender"), alt.Tooltip('count:Q', title="Count")]
).properties(
    width=400,
    height=400
)

# text labels to the doughnut chart 
gender_doughnut_text = gender_doughnut_chart.mark_text(radius=150, size=15).encode(
    text=alt.Text('count:Q', format='.0f')
)

final_chart = alt.layer(gender_doughnut_chart, gender_doughnut_text).configure_legend(
    labelFontSize=12,
    titleFontSize=14
)
st.altair_chart(final_chart)


# Dataset Summary
st.header('Dataset Summary')
st.caption('Mean Loan Amount: ' + str(round(filtered_df['loan_amount'].mean(), 2)))
st.caption('Median Loan Amount: ' + str(round(filtered_df['loan_amount'].median(), 2)))
st.caption('Mode Loan Amount: ' + str(filtered_df['loan_amount'].mode()[0]))
st.write(filtered_df.describe())

# Filtered dataframe
st.header("Filtered Data")
st.dataframe(filtered_df)


st.header('K-Means Clustering')
 
#the columns we want to do kmean to
filtered_df_reduced = filtered_df[['loan_amount', 'term_in_months']]
 
#to determine scaler
fig, ax = plt.subplots(figsize=(10, 5))
filtered_df_reduced.hist(bins=100, ax=ax)
st.pyplot(fig)
 
#my chosen scaler
scaler = MinMaxScaler()
 
data_to_cluster_scaled = scaler.fit_transform(filtered_df_reduced)
 
Sum_of_squared_distances = []
 
K = range(1, 10)
 
for k in K:
    km = KMeans(n_clusters=k, n_init = "auto")
    km.fit(data_to_cluster_scaled)
    Sum_of_squared_distances.append(km.inertia_)
 
fig, ax = plt.subplots()
ax.plot(K, Sum_of_squared_distances, 'bx-')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Sum of Squared Distances')
ax.set_title('Elbow Method For Optimal k')
ax.grid(True)
 
st.pyplot(fig)
 
 
 
def k_means_simple(data, k, max_iters=100):
    centroids = data[np.random.choice(data.shape[0], k, replace=False)]
 
    for _ in range(max_iters):
        distances = np.linalg.norm(data - centroids[:, np.newaxis], axis=2)
        labels = np.argmin(distances, axis=0)
 
        new_centroids = np.array([data[labels == i].mean(axis=0) for i in range(k)])
 
        if np.all(centroids == new_centroids):
            break
 
        centroids = new_centroids
 
    return labels, centroids
 
labels, final_centroids = k_means_simple(data_to_cluster_scaled, 5)
 
 
distances = np.linalg.norm(data_to_cluster_scaled[:, np.newaxis] - final_centroids, axis=2)
nearest_centroid_indices = np.argmin(distances, axis=1)
 
data_df = pd.DataFrame({
    'x': data_to_cluster_scaled[:, 0],
    'y': data_to_cluster_scaled[:, 1],
    'centroid': nearest_centroid_indices
})
 
 
centroids_df = pd.DataFrame({
    'x': final_centroids[:, 0],
    'y': final_centroids[:, 1],
    'centroid': range(final_centroids.shape[0])
})
 
 
data_df['type'] = 'data'
centroids_df['type'] = 'centroid'
 
data_df['loan_amount'] = filtered_df['loan_amount'].values
data_df['term_in_months'] = filtered_df['term_in_months'].values
data_df['activity'] = filtered_df['activity'].values
data_df['sector'] = filtered_df['sector'].values
data_df['region'] = filtered_df['region'].values
 
combined_df = pd.concat([data_df, centroids_df])
 
scatter_plot = alt.Chart(combined_df).mark_circle(size=60).encode(
    x='x',
    y='y',
    color=alt.Color('centroid:N', scale=alt.Scale(scheme='category10')),
    opacity=alt.condition(
        alt.datum.type == 'data',  
        alt.value(0.6),
        alt.value(1)
    ),
    tooltip=[
        alt.Tooltip('loan_amount:Q', title='Loan Amount'),
        alt.Tooltip('term_in_months:Q', title='Term (Months)'),
        alt.Tooltip('activity:N', title='Activity'),
        alt.Tooltip('sector:N', title='Sector'),
        alt.Tooltip('region:N', title='Region')
    ]
).properties(
    title='Reduced Data and Initial Centroids'
)
 
st.altair_chart(scatter_plot, use_container_width=True)

# Fix session states
if 'country_selected' not in st.session_state:
    st.session_state['country_selected'] = None
if 'gender_selected' not in st.session_state:
    st.session_state['gender_selected'] = None
if 'sector_selected' not in st.session_state:
    st.session_state['sector_selected'] = None

# Recommendation Engine based on Country, Gender, and Sector
st.subheader("Loan Recommendation")

# Input for country
country_input = st.selectbox("Select Country", ["None"] + sorted(list(df_new['country'].unique())))
if country_input != "None":
    # Filter gender options based on selected country
    filtered_genders = df_new[df_new['country'] == country_input]['borrower_genders'].unique()
    gender_input = st.selectbox("Select Gender", ["None"] + list(filtered_genders))
else:
    gender_input = st.selectbox("Select Gender", ["None"] + list(df_new['borrower_genders'].unique()))

# Input for sector based on the selected country and gender
if country_input != "None" and gender_input != "None":
    # Filter sector options based on selected country and gender
    filtered_sectors = df_new[(df_new['country'] == country_input) & (df_new['borrower_genders'] == gender_input)]['sector'].unique()
    sector_input = st.selectbox("Select Sector", ["None"] + list(filtered_sectors))
else:
    sector_input = st.selectbox("Select Sector", ["None"] + list(df_new['sector'].unique()))

# Generate recommendations based on country, gender, and sector
if country_input != "None" and gender_input != "None" and sector_input != "None":
    # Filter the DataFrame based on selected country, gender, and sector
    user_filtered_df = df_new[
        (df_new['country'] == country_input) &
        (df_new['borrower_genders'] == gender_input) &
        (df_new['sector'] == sector_input)
    ].reset_index(drop=True)
    
    if not user_filtered_df.empty:
        # Align the filtered DataFrame's indices with the scaled data by resetting both
        data_to_cluster_scaled_filtered = data_to_cluster_scaled[:len(user_filtered_df)]

        # Compute similarity matrix for filtered data
        similarity_matrix = cosine_similarity(data_to_cluster_scaled_filtered)
        
        # Get the most similar loans (top 3)
        similar_loans_indices = np.argsort(similarity_matrix[0])[::-1][1:4]
        
        # Display recommended loans
        recommendations = user_filtered_df.iloc[similar_loans_indices][['country', 'borrower_genders', 'sector', 'loan_amount', 'term_in_months']]
        st.write("Recommended Loans:")
        st.dataframe(recommendations)
    else:
        st.write("No matching loans found for the selected country, gender, and sector.")
else:
    st.write("Please select a country, gender, and sector.")