Spaces:
Sleeping
Sleeping
# Import required packages | |
import pandas as pd | |
import numpy as np | |
import altair as alt | |
import streamlit as st | |
import matplotlib.pyplot as plt | |
from scipy.stats import zscore | |
import streamlit as st | |
import altair as alt | |
from sklearn.cluster import KMeans | |
from sklearn.preprocessing import MinMaxScaler | |
from sklearn.metrics.pairwise import cosine_similarity | |
df_new= pd.read_csv('Final_data.csv') | |
# Page title | |
st.title("Exploratory Data Analysis on Kiva Loans") | |
st.sidebar.header("Filters") | |
# Filter for Country | |
country = df_new['country'].unique() | |
selected_country = st.sidebar.selectbox("Select Country", country.tolist()) | |
if selected_country: | |
filtered_df = df_new[df_new['country'] == selected_country] | |
else: | |
st.warning("Please select a country from the sidebar") | |
st.stop() | |
# Filter for Gender | |
borrower_genders = df_new['borrower_genders'].unique() | |
selected_genders = st.sidebar.multiselect("Select Gender", borrower_genders.tolist(), default=borrower_genders.tolist()) | |
filtered_df = filtered_df[filtered_df['borrower_genders'].isin(selected_genders)] | |
# Filter for Loan Amount | |
min_loan, max_loan = float(df_new['loan_amount'].min()), float(df_new['loan_amount'].max()) | |
selected_loan_amount = st.sidebar.slider("Select Loan Amount", min_value=min_loan, max_value=max_loan, value=(min_loan, max_loan)) | |
filtered_df = filtered_df[(filtered_df['loan_amount'] >= selected_loan_amount[0]) & (filtered_df['loan_amount'] <= selected_loan_amount[1])] | |
# Filter for Years | |
filtered_df['year'] = pd.to_datetime(filtered_df['date']).dt.year | |
years = sorted(filtered_df['year'].unique()) | |
selected_years = st.sidebar.multiselect("Select Year(s)", years, default=years) | |
filtered_df = filtered_df[filtered_df['year'].isin(selected_years)] | |
# selected filters | |
st.caption(f"Data for Country: {selected_country} | Gender: {', '.join(selected_genders)} | Loan Amount: {selected_loan_amount} | Years: {', '.join(map(str, selected_years))}") | |
# Distribution of Loan Sector | |
st.subheader('Distribution of Loan Sector') | |
sector_chart = alt.Chart(filtered_df).mark_bar().encode( | |
x=alt.X('count(sector):Q', title='Count'), | |
y=alt.Y('sector:N', sort='-x', title='Sector'), | |
color=alt.Color('sector:N', legend=None) | |
).properties( | |
width=600, | |
height=400 | |
) | |
st.altair_chart(sector_chart) | |
# Distribution of Loan Term | |
st.subheader('Distribution of Loan Term (in Months)') | |
term_hist = alt.Chart(filtered_df).mark_bar().encode( | |
x=alt.X('term_in_months:Q', bin=alt.Bin(maxbins=30), title='Term in Months'), | |
y=alt.Y('count():Q', title='Frequency'), | |
color=alt.Color('term_in_months:Q', legend=None) | |
).properties( | |
width=600, | |
height=400 | |
) | |
st.altair_chart(term_hist) | |
# Monthly Loan Amounts Over Time | |
st.subheader('Monthly Loan Amounts Over Time') | |
filtered_df['month'] = pd.to_datetime(filtered_df['date']).dt.month | |
filtered_df['month_name'] = pd.to_datetime(filtered_df['date']).dt.strftime('%b') | |
filtered_df['year'] = pd.to_datetime(filtered_df['date']).dt.year | |
monthly_loan_amount = filtered_df.groupby(['year', 'month_name', 'month'])['loan_amount'].sum().reset_index() | |
loan_time_series = alt.Chart(monthly_loan_amount).mark_line(point=True).encode( | |
x=alt.X('month_name:N', sort=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], title='Month'), | |
y=alt.Y('loan_amount:Q', title='Total Loan Amount'), | |
color=alt.Color('year:N', title='Year'), | |
tooltip=['year', 'month_name', 'loan_amount'] | |
).properties( | |
width=700, | |
height=400 | |
) | |
st.altair_chart(loan_time_series) | |
# Top 10 Countries with Highest Average Loan Amount | |
st.subheader('Top 10 Countries with Highest Average Loan Amount') | |
df_clean = df_new[df_new['country'].notna() & (df_new['country'].str.strip() != '')] | |
df_clean['country'] = df_clean['country'].str.strip() | |
top_10_countries_avg_loan = df_new.groupby('country')['loan_amount'].mean().nlargest(10).reset_index() | |
top_10_chart = alt.Chart(top_10_countries_avg_loan).mark_bar().encode( | |
x=alt.X('loan_amount:Q', title='Average Loan Amount'), | |
y=alt.Y('country:N', sort='-x', title='Country'), | |
color=alt.Color('country:N', legend=None) | |
).properties( | |
width=600, | |
height=400 | |
) | |
st.altair_chart(top_10_chart) | |
# Distribution of Genders | |
st.subheader('Distribution of Borrower Genders') | |
gender_counts = filtered_df['borrower_genders'].value_counts().reset_index() | |
gender_counts.columns = ['borrower_genders', 'count'] | |
gender_doughnut_chart = alt.Chart(gender_counts).mark_arc(innerRadius=80, outerRadius=120).encode( | |
theta=alt.Theta(field="count", type="quantitative"), | |
color=alt.Color(field="borrower_genders", type="nominal", title="Borrower Genders"), | |
tooltip=[alt.Tooltip('borrower_genders:N', title="Gender"), alt.Tooltip('count:Q', title="Count")] | |
).properties( | |
width=400, | |
height=400 | |
) | |
# text labels to the doughnut chart | |
gender_doughnut_text = gender_doughnut_chart.mark_text(radius=150, size=15).encode( | |
text=alt.Text('count:Q', format='.0f') | |
) | |
final_chart = alt.layer(gender_doughnut_chart, gender_doughnut_text).configure_legend( | |
labelFontSize=12, | |
titleFontSize=14 | |
) | |
st.altair_chart(final_chart) | |
# Dataset Summary | |
st.header('Dataset Summary') | |
st.caption('Mean Loan Amount: ' + str(round(filtered_df['loan_amount'].mean(), 2))) | |
st.caption('Median Loan Amount: ' + str(round(filtered_df['loan_amount'].median(), 2))) | |
st.caption('Mode Loan Amount: ' + str(filtered_df['loan_amount'].mode()[0])) | |
st.write(filtered_df.describe()) | |
# Filtered dataframe | |
st.header("Filtered Data") | |
st.dataframe(filtered_df) | |
st.header('K-Means Clustering') | |
#the columns we want to do kmean to | |
filtered_df_reduced = filtered_df[['loan_amount', 'term_in_months']] | |
#to determine scaler | |
fig, ax = plt.subplots(figsize=(10, 5)) | |
filtered_df_reduced.hist(bins=100, ax=ax) | |
st.pyplot(fig) | |
#my chosen scaler | |
scaler = MinMaxScaler() | |
data_to_cluster_scaled = scaler.fit_transform(filtered_df_reduced) | |
Sum_of_squared_distances = [] | |
K = range(1, 10) | |
for k in K: | |
km = KMeans(n_clusters=k, n_init = "auto") | |
km.fit(data_to_cluster_scaled) | |
Sum_of_squared_distances.append(km.inertia_) | |
fig, ax = plt.subplots() | |
ax.plot(K, Sum_of_squared_distances, 'bx-') | |
ax.set_xlabel('Number of Clusters (k)') | |
ax.set_ylabel('Sum of Squared Distances') | |
ax.set_title('Elbow Method For Optimal k') | |
ax.grid(True) | |
st.pyplot(fig) | |
def k_means_simple(data, k, max_iters=100): | |
centroids = data[np.random.choice(data.shape[0], k, replace=False)] | |
for _ in range(max_iters): | |
distances = np.linalg.norm(data - centroids[:, np.newaxis], axis=2) | |
labels = np.argmin(distances, axis=0) | |
new_centroids = np.array([data[labels == i].mean(axis=0) for i in range(k)]) | |
if np.all(centroids == new_centroids): | |
break | |
centroids = new_centroids | |
return labels, centroids | |
labels, final_centroids = k_means_simple(data_to_cluster_scaled, 5) | |
distances = np.linalg.norm(data_to_cluster_scaled[:, np.newaxis] - final_centroids, axis=2) | |
nearest_centroid_indices = np.argmin(distances, axis=1) | |
data_df = pd.DataFrame({ | |
'x': data_to_cluster_scaled[:, 0], | |
'y': data_to_cluster_scaled[:, 1], | |
'centroid': nearest_centroid_indices | |
}) | |
centroids_df = pd.DataFrame({ | |
'x': final_centroids[:, 0], | |
'y': final_centroids[:, 1], | |
'centroid': range(final_centroids.shape[0]) | |
}) | |
data_df['type'] = 'data' | |
centroids_df['type'] = 'centroid' | |
data_df['loan_amount'] = filtered_df['loan_amount'].values | |
data_df['term_in_months'] = filtered_df['term_in_months'].values | |
data_df['activity'] = filtered_df['activity'].values | |
data_df['sector'] = filtered_df['sector'].values | |
data_df['region'] = filtered_df['region'].values | |
combined_df = pd.concat([data_df, centroids_df]) | |
scatter_plot = alt.Chart(combined_df).mark_circle(size=60).encode( | |
x='x', | |
y='y', | |
color=alt.Color('centroid:N', scale=alt.Scale(scheme='category10')), | |
opacity=alt.condition( | |
alt.datum.type == 'data', | |
alt.value(0.6), | |
alt.value(1) | |
), | |
tooltip=[ | |
alt.Tooltip('loan_amount:Q', title='Loan Amount'), | |
alt.Tooltip('term_in_months:Q', title='Term (Months)'), | |
alt.Tooltip('activity:N', title='Activity'), | |
alt.Tooltip('sector:N', title='Sector'), | |
alt.Tooltip('region:N', title='Region') | |
] | |
).properties( | |
title='Reduced Data and Initial Centroids' | |
) | |
st.altair_chart(scatter_plot, use_container_width=True) | |
# Fix session states | |
if 'country_selected' not in st.session_state: | |
st.session_state['country_selected'] = None | |
if 'gender_selected' not in st.session_state: | |
st.session_state['gender_selected'] = None | |
if 'sector_selected' not in st.session_state: | |
st.session_state['sector_selected'] = None | |
# Recommendation Engine based on Country, Gender, and Sector | |
st.subheader("Loan Recommendation") | |
# Input for country | |
country_input = st.selectbox("Select Country", ["None"] + sorted(list(df_new['country'].unique()))) | |
if country_input != "None": | |
# Filter gender options based on selected country | |
filtered_genders = df_new[df_new['country'] == country_input]['borrower_genders'].unique() | |
gender_input = st.selectbox("Select Gender", ["None"] + list(filtered_genders)) | |
else: | |
gender_input = st.selectbox("Select Gender", ["None"] + list(df_new['borrower_genders'].unique())) | |
# Input for sector based on the selected country and gender | |
if country_input != "None" and gender_input != "None": | |
# Filter sector options based on selected country and gender | |
filtered_sectors = df_new[(df_new['country'] == country_input) & (df_new['borrower_genders'] == gender_input)]['sector'].unique() | |
sector_input = st.selectbox("Select Sector", ["None"] + list(filtered_sectors)) | |
else: | |
sector_input = st.selectbox("Select Sector", ["None"] + list(df_new['sector'].unique())) | |
# Generate recommendations based on country, gender, and sector | |
if country_input != "None" and gender_input != "None" and sector_input != "None": | |
# Filter the DataFrame based on selected country, gender, and sector | |
user_filtered_df = df_new[ | |
(df_new['country'] == country_input) & | |
(df_new['borrower_genders'] == gender_input) & | |
(df_new['sector'] == sector_input) | |
].reset_index(drop=True) | |
if not user_filtered_df.empty: | |
# Align the filtered DataFrame's indices with the scaled data by resetting both | |
data_to_cluster_scaled_filtered = data_to_cluster_scaled[:len(user_filtered_df)] | |
# Compute similarity matrix for filtered data | |
similarity_matrix = cosine_similarity(data_to_cluster_scaled_filtered) | |
# Get the most similar loans (top 3) | |
similar_loans_indices = np.argsort(similarity_matrix[0])[::-1][1:4] | |
# Display recommended loans | |
recommendations = user_filtered_df.iloc[similar_loans_indices][['country', 'borrower_genders', 'sector', 'loan_amount', 'term_in_months']] | |
st.write("Recommended Loans:") | |
st.dataframe(recommendations) | |
else: | |
st.write("No matching loans found for the selected country, gender, and sector.") | |
else: | |
st.write("Please select a country, gender, and sector.") |