# Import required packages import pandas as pd import numpy as np import altair as alt import streamlit as st import matplotlib.pyplot as plt from scipy.stats import zscore import streamlit as st import altair as alt from sklearn.cluster import KMeans from sklearn.preprocessing import MinMaxScaler from sklearn.metrics.pairwise import cosine_similarity df_new= pd.read_csv('Final_data.csv') # Page title st.title("Exploratory Data Analysis on Kiva Loans") st.sidebar.header("Filters") # Filter for Country country = df_new['country'].unique() selected_country = st.sidebar.selectbox("Select Country", country.tolist()) if selected_country: filtered_df = df_new[df_new['country'] == selected_country] else: st.warning("Please select a country from the sidebar") st.stop() # Filter for Gender borrower_genders = df_new['borrower_genders'].unique() selected_genders = st.sidebar.multiselect("Select Gender", borrower_genders.tolist(), default=borrower_genders.tolist()) filtered_df = filtered_df[filtered_df['borrower_genders'].isin(selected_genders)] # Filter for Loan Amount min_loan, max_loan = float(df_new['loan_amount'].min()), float(df_new['loan_amount'].max()) selected_loan_amount = st.sidebar.slider("Select Loan Amount", min_value=min_loan, max_value=max_loan, value=(min_loan, max_loan)) filtered_df = filtered_df[(filtered_df['loan_amount'] >= selected_loan_amount[0]) & (filtered_df['loan_amount'] <= selected_loan_amount[1])] # Filter for Years filtered_df['year'] = pd.to_datetime(filtered_df['date']).dt.year years = sorted(filtered_df['year'].unique()) selected_years = st.sidebar.multiselect("Select Year(s)", years, default=years) filtered_df = filtered_df[filtered_df['year'].isin(selected_years)] # selected filters st.caption(f"Data for Country: {selected_country} | Gender: {', '.join(selected_genders)} | Loan Amount: {selected_loan_amount} | Years: {', '.join(map(str, selected_years))}") # Distribution of Loan Sector st.subheader('Distribution of Loan Sector') sector_chart = alt.Chart(filtered_df).mark_bar().encode( x=alt.X('count(sector):Q', title='Count'), y=alt.Y('sector:N', sort='-x', title='Sector'), color=alt.Color('sector:N', legend=None) ).properties( width=600, height=400 ) st.altair_chart(sector_chart) # Distribution of Loan Term st.subheader('Distribution of Loan Term (in Months)') term_hist = alt.Chart(filtered_df).mark_bar().encode( x=alt.X('term_in_months:Q', bin=alt.Bin(maxbins=30), title='Term in Months'), y=alt.Y('count():Q', title='Frequency'), color=alt.Color('term_in_months:Q', legend=None) ).properties( width=600, height=400 ) st.altair_chart(term_hist) # Monthly Loan Amounts Over Time st.subheader('Monthly Loan Amounts Over Time') filtered_df['month'] = pd.to_datetime(filtered_df['date']).dt.month filtered_df['month_name'] = pd.to_datetime(filtered_df['date']).dt.strftime('%b') filtered_df['year'] = pd.to_datetime(filtered_df['date']).dt.year monthly_loan_amount = filtered_df.groupby(['year', 'month_name', 'month'])['loan_amount'].sum().reset_index() loan_time_series = alt.Chart(monthly_loan_amount).mark_line(point=True).encode( x=alt.X('month_name:N', sort=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], title='Month'), y=alt.Y('loan_amount:Q', title='Total Loan Amount'), color=alt.Color('year:N', title='Year'), tooltip=['year', 'month_name', 'loan_amount'] ).properties( width=700, height=400 ) st.altair_chart(loan_time_series) # Top 10 Countries with Highest Average Loan Amount st.subheader('Top 10 Countries with Highest Average Loan Amount') df_clean = df_new[df_new['country'].notna() & (df_new['country'].str.strip() != '')] df_clean['country'] = df_clean['country'].str.strip() top_10_countries_avg_loan = df_new.groupby('country')['loan_amount'].mean().nlargest(10).reset_index() top_10_chart = alt.Chart(top_10_countries_avg_loan).mark_bar().encode( x=alt.X('loan_amount:Q', title='Average Loan Amount'), y=alt.Y('country:N', sort='-x', title='Country'), color=alt.Color('country:N', legend=None) ).properties( width=600, height=400 ) st.altair_chart(top_10_chart) # Distribution of Genders st.subheader('Distribution of Borrower Genders') gender_counts = filtered_df['borrower_genders'].value_counts().reset_index() gender_counts.columns = ['borrower_genders', 'count'] gender_doughnut_chart = alt.Chart(gender_counts).mark_arc(innerRadius=80, outerRadius=120).encode( theta=alt.Theta(field="count", type="quantitative"), color=alt.Color(field="borrower_genders", type="nominal", title="Borrower Genders"), tooltip=[alt.Tooltip('borrower_genders:N', title="Gender"), alt.Tooltip('count:Q', title="Count")] ).properties( width=400, height=400 ) # text labels to the doughnut chart gender_doughnut_text = gender_doughnut_chart.mark_text(radius=150, size=15).encode( text=alt.Text('count:Q', format='.0f') ) final_chart = alt.layer(gender_doughnut_chart, gender_doughnut_text).configure_legend( labelFontSize=12, titleFontSize=14 ) st.altair_chart(final_chart) # Dataset Summary st.header('Dataset Summary') st.caption('Mean Loan Amount: ' + str(round(filtered_df['loan_amount'].mean(), 2))) st.caption('Median Loan Amount: ' + str(round(filtered_df['loan_amount'].median(), 2))) st.caption('Mode Loan Amount: ' + str(filtered_df['loan_amount'].mode()[0])) st.write(filtered_df.describe()) # Filtered dataframe st.header("Filtered Data") st.dataframe(filtered_df) st.header('K-Means Clustering') #the columns we want to do kmean to filtered_df_reduced = filtered_df[['loan_amount', 'term_in_months']] #to determine scaler fig, ax = plt.subplots(figsize=(10, 5)) filtered_df_reduced.hist(bins=100, ax=ax) st.pyplot(fig) #my chosen scaler scaler = MinMaxScaler() data_to_cluster_scaled = scaler.fit_transform(filtered_df_reduced) Sum_of_squared_distances = [] K = range(1, 10) for k in K: km = KMeans(n_clusters=k, n_init = "auto") km.fit(data_to_cluster_scaled) Sum_of_squared_distances.append(km.inertia_) fig, ax = plt.subplots() ax.plot(K, Sum_of_squared_distances, 'bx-') ax.set_xlabel('Number of Clusters (k)') ax.set_ylabel('Sum of Squared Distances') ax.set_title('Elbow Method For Optimal k') ax.grid(True) st.pyplot(fig) def k_means_simple(data, k, max_iters=100): centroids = data[np.random.choice(data.shape[0], k, replace=False)] for _ in range(max_iters): distances = np.linalg.norm(data - centroids[:, np.newaxis], axis=2) labels = np.argmin(distances, axis=0) new_centroids = np.array([data[labels == i].mean(axis=0) for i in range(k)]) if np.all(centroids == new_centroids): break centroids = new_centroids return labels, centroids labels, final_centroids = k_means_simple(data_to_cluster_scaled, 5) distances = np.linalg.norm(data_to_cluster_scaled[:, np.newaxis] - final_centroids, axis=2) nearest_centroid_indices = np.argmin(distances, axis=1) data_df = pd.DataFrame({ 'x': data_to_cluster_scaled[:, 0], 'y': data_to_cluster_scaled[:, 1], 'centroid': nearest_centroid_indices }) centroids_df = pd.DataFrame({ 'x': final_centroids[:, 0], 'y': final_centroids[:, 1], 'centroid': range(final_centroids.shape[0]) }) data_df['type'] = 'data' centroids_df['type'] = 'centroid' data_df['loan_amount'] = filtered_df['loan_amount'].values data_df['term_in_months'] = filtered_df['term_in_months'].values data_df['activity'] = filtered_df['activity'].values data_df['sector'] = filtered_df['sector'].values data_df['region'] = filtered_df['region'].values combined_df = pd.concat([data_df, centroids_df]) scatter_plot = alt.Chart(combined_df).mark_circle(size=60).encode( x='x', y='y', color=alt.Color('centroid:N', scale=alt.Scale(scheme='category10')), opacity=alt.condition( alt.datum.type == 'data', alt.value(0.6), alt.value(1) ), tooltip=[ alt.Tooltip('loan_amount:Q', title='Loan Amount'), alt.Tooltip('term_in_months:Q', title='Term (Months)'), alt.Tooltip('activity:N', title='Activity'), alt.Tooltip('sector:N', title='Sector'), alt.Tooltip('region:N', title='Region') ] ).properties( title='Reduced Data and Initial Centroids' ) st.altair_chart(scatter_plot, use_container_width=True) # Fix session states if 'country_selected' not in st.session_state: st.session_state['country_selected'] = None if 'gender_selected' not in st.session_state: st.session_state['gender_selected'] = None if 'sector_selected' not in st.session_state: st.session_state['sector_selected'] = None # Recommendation Engine based on Country, Gender, and Sector st.subheader("Loan Recommendation") # Input for country country_input = st.selectbox("Select Country", ["None"] + sorted(list(df_new['country'].unique()))) if country_input != "None": # Filter gender options based on selected country filtered_genders = df_new[df_new['country'] == country_input]['borrower_genders'].unique() gender_input = st.selectbox("Select Gender", ["None"] + list(filtered_genders)) else: gender_input = st.selectbox("Select Gender", ["None"] + list(df_new['borrower_genders'].unique())) # Input for sector based on the selected country and gender if country_input != "None" and gender_input != "None": # Filter sector options based on selected country and gender filtered_sectors = df_new[(df_new['country'] == country_input) & (df_new['borrower_genders'] == gender_input)]['sector'].unique() sector_input = st.selectbox("Select Sector", ["None"] + list(filtered_sectors)) else: sector_input = st.selectbox("Select Sector", ["None"] + list(df_new['sector'].unique())) # Generate recommendations based on country, gender, and sector if country_input != "None" and gender_input != "None" and sector_input != "None": # Filter the DataFrame based on selected country, gender, and sector user_filtered_df = df_new[ (df_new['country'] == country_input) & (df_new['borrower_genders'] == gender_input) & (df_new['sector'] == sector_input) ].reset_index(drop=True) if not user_filtered_df.empty: # Align the filtered DataFrame's indices with the scaled data by resetting both data_to_cluster_scaled_filtered = data_to_cluster_scaled[:len(user_filtered_df)] # Compute similarity matrix for filtered data similarity_matrix = cosine_similarity(data_to_cluster_scaled_filtered) # Get the most similar loans (top 3) similar_loans_indices = np.argsort(similarity_matrix[0])[::-1][1:4] # Display recommended loans recommendations = user_filtered_df.iloc[similar_loans_indices][['country', 'borrower_genders', 'sector', 'loan_amount', 'term_in_months']] st.write("Recommended Loans:") st.dataframe(recommendations) else: st.write("No matching loans found for the selected country, gender, and sector.") else: st.write("Please select a country, gender, and sector.")