Spaces:

sufianahmad513
/

assign

Sleeping

App Files Files Community

sufianahmad513 commited on Nov 20, 2024

Commit

1f70e3d

verified ·

1 Parent(s): 4e7d6cc

Create app.py

Browse files

Files changed (1) hide show

app.py +305 -0

app.py ADDED Viewed

	@@ -0,0 +1,305 @@

+Copy of abc.py
+# Import required packages
+import pandas as pd
+import numpy as np
+import altair as alt
+import streamlit as st
+import matplotlib.pyplot as plt
+from scipy.stats import zscore
+import streamlit as st
+import altair as alt
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics.pairwise import cosine_similarity
+df_new= pd.read_csv('Final_data.csv')
+# Page title
+st.title("Exploratory Data Analysis on Kiva Loans")
+st.sidebar.header("Filters")
+# Filter for Country
+country = df_new['country'].unique()
+selected_country = st.sidebar.selectbox("Select Country", country.tolist())
+if selected_country:
+    filtered_df = df_new[df_new['country'] == selected_country]
+else:
+    st.warning("Please select a country from the sidebar")
+    st.stop()
+# Filter for Gender
+borrower_genders = df_new['borrower_genders'].unique()
+selected_genders = st.sidebar.multiselect("Select Gender", borrower_genders.tolist(), default=borrower_genders.tolist())
+filtered_df = filtered_df[filtered_df['borrower_genders'].isin(selected_genders)]
+# Filter for Loan Amount
+min_loan, max_loan = float(df_new['loan_amount'].min()), float(df_new['loan_amount'].max())
+selected_loan_amount = st.sidebar.slider("Select Loan Amount", min_value=min_loan, max_value=max_loan, value=(min_loan, max_loan))
+filtered_df = filtered_df[(filtered_df['loan_amount'] >= selected_loan_amount[0]) & (filtered_df['loan_amount'] <= selected_loan_amount[1])]
+# Filter for Years
+filtered_df['year'] = pd.to_datetime(filtered_df['date']).dt.year
+years = sorted(filtered_df['year'].unique())
+selected_years = st.sidebar.multiselect("Select Year(s)", years, default=years)
+filtered_df = filtered_df[filtered_df['year'].isin(selected_years)]
+# selected filters
+st.caption(f"Data for Country: {selected_country} | Gender: {', '.join(selected_genders)} | Loan Amount: {selected_loan_amount} | Years: {', '.join(map(str, selected_years))}")
+# Distribution of Loan Sector
+st.subheader('Distribution of Loan Sector')
+sector_chart = alt.Chart(filtered_df).mark_bar().encode(
+    x=alt.X('count(sector):Q', title='Count'),
+    y=alt.Y('sector:N', sort='-x', title='Sector'),
+    color=alt.Color('sector:N', legend=None)
+).properties(
+    width=600,
+    height=400
+)
+st.altair_chart(sector_chart)
+# Distribution of Loan Term
+st.subheader('Distribution of Loan Term (in Months)')
+term_hist = alt.Chart(filtered_df).mark_bar().encode(
+    x=alt.X('term_in_months:Q', bin=alt.Bin(maxbins=30), title='Term in Months'),
+    y=alt.Y('count():Q', title='Frequency'),
+    color=alt.Color('term_in_months:Q', legend=None)
+).properties(
+    width=600,
+    height=400
+)
+st.altair_chart(term_hist)
+# Monthly Loan Amounts Over Time
+st.subheader('Monthly Loan Amounts Over Time')
+filtered_df['month'] = pd.to_datetime(filtered_df['date']).dt.month
+filtered_df['month_name'] = pd.to_datetime(filtered_df['date']).dt.strftime('%b')
+filtered_df['year'] = pd.to_datetime(filtered_df['date']).dt.year
+monthly_loan_amount = filtered_df.groupby(['year', 'month_name', 'month'])['loan_amount'].sum().reset_index()
+loan_time_series = alt.Chart(monthly_loan_amount).mark_line(point=True).encode(
+    x=alt.X('month_name:N', sort=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], title='Month'),
+    y=alt.Y('loan_amount:Q', title='Total Loan Amount'),
+    color=alt.Color('year:N', title='Year'),
+    tooltip=['year', 'month_name', 'loan_amount']
+).properties(
+    width=700,
+    height=400
+)
+st.altair_chart(loan_time_series)
+# Top 10 Countries with Highest Average Loan Amount
+st.subheader('Top 10 Countries with Highest Average Loan Amount')
+df_clean = df_new[df_new['country'].notna() & (df_new['country'].str.strip() != '')]
+df_clean['country'] = df_clean['country'].str.strip()
+top_10_countries_avg_loan = df_new.groupby('country')['loan_amount'].mean().nlargest(10).reset_index()
+top_10_chart = alt.Chart(top_10_countries_avg_loan).mark_bar().encode(
+    x=alt.X('loan_amount:Q', title='Average Loan Amount'),
+    y=alt.Y('country:N', sort='-x', title='Country'),
+    color=alt.Color('country:N', legend=None)
+).properties(
+    width=600,
+    height=400
+)
+st.altair_chart(top_10_chart)
+# Distribution of Genders
+st.subheader('Distribution of Borrower Genders')
+gender_counts = filtered_df['borrower_genders'].value_counts().reset_index()
+gender_counts.columns = ['borrower_genders', 'count']
+gender_doughnut_chart = alt.Chart(gender_counts).mark_arc(innerRadius=80, outerRadius=120).encode(
+    theta=alt.Theta(field="count", type="quantitative"),
+    color=alt.Color(field="borrower_genders", type="nominal", title="Borrower Genders"),
+    tooltip=[alt.Tooltip('borrower_genders:N', title="Gender"), alt.Tooltip('count:Q', title="Count")]
+).properties(
+    width=400,
+    height=400
+)
+# text labels to the doughnut chart
+gender_doughnut_text = gender_doughnut_chart.mark_text(radius=150, size=15).encode(
+    text=alt.Text('count:Q', format='.0f')
+)
+final_chart = alt.layer(gender_doughnut_chart, gender_doughnut_text).configure_legend(
+    labelFontSize=12,
+    titleFontSize=14
+)
+st.altair_chart(final_chart)
+# Dataset Summary
+st.header('Dataset Summary')
+st.caption('Mean Loan Amount: ' + str(round(filtered_df['loan_amount'].mean(), 2)))
+st.caption('Median Loan Amount: ' + str(round(filtered_df['loan_amount'].median(), 2)))
+st.caption('Mode Loan Amount: ' + str(filtered_df['loan_amount'].mode()[0]))
+st.write(filtered_df.describe())
+# Filtered dataframe
+st.header("Filtered Data")
+st.dataframe(filtered_df)
+st.header('K-Means Clustering')
+#the columns we want to do kmean to
+filtered_df_reduced = filtered_df[['loan_amount', 'term_in_months']]
+#to determine scaler
+fig, ax = plt.subplots(figsize=(10, 5))
+filtered_df_reduced.hist(bins=100, ax=ax)
+st.pyplot(fig)
+#my chosen scaler
+scaler = MinMaxScaler()
+data_to_cluster_scaled = scaler.fit_transform(filtered_df_reduced)
+Sum_of_squared_distances = []
+K = range(1, 10)
+for k in K:
+    km = KMeans(n_clusters=k, n_init = "auto")
+    km.fit(data_to_cluster_scaled)
+    Sum_of_squared_distances.append(km.inertia_)
+fig, ax = plt.subplots()
+ax.plot(K, Sum_of_squared_distances, 'bx-')
+ax.set_xlabel('Number of Clusters (k)')
+ax.set_ylabel('Sum of Squared Distances')
+ax.set_title('Elbow Method For Optimal k')
+ax.grid(True)
+st.pyplot(fig)
+def k_means_simple(data, k, max_iters=100):
+    centroids = data[np.random.choice(data.shape[0], k, replace=False)]
+    for _ in range(max_iters):
+        distances = np.linalg.norm(data - centroids[:, np.newaxis], axis=2)
+        labels = np.argmin(distances, axis=0)
+        new_centroids = np.array([data[labels == i].mean(axis=0) for i in range(k)])
+        if np.all(centroids == new_centroids):
+            break
+        centroids = new_centroids
+    return labels, centroids
+labels, final_centroids = k_means_simple(data_to_cluster_scaled, 5)
+distances = np.linalg.norm(data_to_cluster_scaled[:, np.newaxis] - final_centroids, axis=2)
+nearest_centroid_indices = np.argmin(distances, axis=1)
+data_df = pd.DataFrame({
+    'x': data_to_cluster_scaled[:, 0],
+    'y': data_to_cluster_scaled[:, 1],
+    'centroid': nearest_centroid_indices
+})
+centroids_df = pd.DataFrame({
+    'x': final_centroids[:, 0],
+    'y': final_centroids[:, 1],
+    'centroid': range(final_centroids.shape[0])
+})
+data_df['type'] = 'data'
+centroids_df['type'] = 'centroid'
+data_df['loan_amount'] = filtered_df['loan_amount'].values
+data_df['term_in_months'] = filtered_df['term_in_months'].values
+data_df['activity'] = filtered_df['activity'].values
+data_df['sector'] = filtered_df['sector'].values
+data_df['region'] = filtered_df['region'].values
+combined_df = pd.concat([data_df, centroids_df])
+scatter_plot = alt.Chart(combined_df).mark_circle(size=60).encode(
+    x='x',
+    y='y',
+    color=alt.Color('centroid:N', scale=alt.Scale(scheme='category10')),
+    opacity=alt.condition(
+        alt.datum.type == 'data',
+        alt.value(0.6),
+        alt.value(1)
+    ),
+    tooltip=[
+        alt.Tooltip('loan_amount:Q', title='Loan Amount'),
+        alt.Tooltip('term_in_months:Q', title='Term (Months)'),
+        alt.Tooltip('activity:N', title='Activity'),
+        alt.Tooltip('sector:N', title='Sector'),
+        alt.Tooltip('region:N', title='Region')
+    ]
+).properties(
+    title='Reduced Data and Initial Centroids'
+)
+st.altair_chart(scatter_plot, use_container_width=True)
+# Fix session states
+if 'country_selected' not in st.session_state:
+    st.session_state['country_selected'] = None
+if 'gender_selected' not in st.session_state:
+    st.session_state['gender_selected'] = None
+if 'sector_selected' not in st.session_state:
+    st.session_state['sector_selected'] = None
+# Recommendation Engine based on Country, Gender, and Sector
+st.subheader("Loan Recommendation")
+# Input for country
+country_input = st.selectbox("Select Country", ["None"] + sorted(list(df_new['country'].unique())))
+if country_input != "None":
+    # Filter gender options based on selected country
+    filtered_genders = df_new[df_new['country'] == country_input]['borrower_genders'].unique()
+    gender_input = st.selectbox("Select Gender", ["None"] + list(filtered_genders))
+else:
+    gender_input = st.selectbox("Select Gender", ["None"] + list(df_new['borrower_genders'].unique()))
+# Input for sector based on the selected country and gender
+if country_input != "None" and gender_input != "None":
+    # Filter sector options based on selected country and gender
+    filtered_sectors = df_new[(df_new['country'] == country_input) & (df_new['borrower_genders'] == gender_input)]['sector'].unique()
+    sector_input = st.selectbox("Select Sector", ["None"] + list(filtered_sectors))
+else:
+    sector_input = st.selectbox("Select Sector", ["None"] + list(df_new['sector'].unique()))
+# Generate recommendations based on country, gender, and sector
+if country_input != "None" and gender_input != "None" and sector_input != "None":
+    # Filter the DataFrame based on selected country, gender, and sector
+    user_filtered_df = df_new[
+        (df_new['country'] == country_input) &
+        (df_new['borrower_genders'] == gender_input) &
+        (df_new['sector'] == sector_input)
+    ].reset_index(drop=True)
+    if not user_filtered_df.empty:
+        # Align the filtered DataFrame's indices with the scaled data by resetting both
+        data_to_cluster_scaled_filtered = data_to_cluster_scaled[:len(user_filtered_df)]
+        # Compute similarity matrix for filtered data
+        similarity_matrix = cosine_similarity(data_to_cluster_scaled_filtered)
+        # Get the most similar loans (top 3)
+        similar_loans_indices = np.argsort(similarity_matrix[0])[::-1][1:4]
+        # Display recommended loans
+        recommendations = user_filtered_df.iloc[similar_loans_indices][['country', 'borrower_genders', 'sector', 'loan_amount', 'term_in_months']]
+        st.write("Recommended Loans:")
+        st.dataframe(recommendations)
+    else:
+        st.write("No matching loans found for the selected country, gender, and sector.")
+else:
+    st.write("Please select a country, gender, and sector.")