Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import requests | |
| from urllib.parse import urlparse, quote | |
| import re | |
| from bs4 import BeautifulSoup | |
| import time | |
| from joblib import Parallel, delayed | |
| from nltk import ngrams | |
def convert_df(df):
    """Serialize a DataFrame to CSV text.

    Args:
        df: The ``pandas.DataFrame`` to serialize.

    Returns:
        The CSV representation as a string, produced by ``DataFrame.to_csv()``
        with its default options (the index is written as the first column).
    """
    return df.to_csv()
def normalize_string(string):
    """Lower-case a string and strip its punctuation.

    Args:
        string: Text to normalize.

    Returns:
        The lower-cased text with every character that is neither a word
        character (``\\w``, which includes digits and ``_``) nor whitespace
        removed.
    """
    normalized_string = string.lower()
    normalized_string = re.sub(r'[^\w\s]', '', normalized_string)
    return normalized_string
def jaccard_similarity(string1, string2, n=2, normalize=True):
    """Return the Jaccard similarity of the character n-gram sets of two strings.

    Args:
        string1: First string.
        string2: Second string.
        n: Length of the character n-grams (bigrams by default).
        normalize: When true, both strings are passed through
            ``normalize_string`` before the n-grams are built.

    Returns:
        The size of the intersection of the two n-gram sets divided by the
        size of their union, as a float between 0 and 1. Returns 0.0 when
        neither string yields any n-gram (for example both are shorter than
        ``n``), instead of dividing by zero.
    """
    if normalize:
        string1, string2 = normalize_string(string1), normalize_string(string2)
    grams1 = set(ngrams(string1, n))
    grams2 = set(ngrams(string2, n))
    union = grams1 | grams2
    if not union:
        # Nothing to compare: define the similarity as 0 rather than 0/0.
        return 0.0
    return len(grams1 & grams2) / len(union)
def extract_website_domain(url):
    """Return the network-location part of a URL.

    Args:
        url: The URL string to inspect.

    Returns:
        The ``netloc`` component reported by ``urllib.parse.urlparse`` (host
        name plus any port or credentials), or an empty string when the input
        has none.
    """
    parsed_url = urlparse(url)
    return parsed_url.netloc
def google_address(address):
    """Search Google for an address and tabulate property facts from the results.

    The address is URL-quoted into a Google search URL, the result page is
    fetched with ``requests`` and parsed with BeautifulSoup. Every anchor whose
    href starts with ``/url?q=http`` and whose text is longer than 20
    characters becomes one row; the page text between one result title and the
    next is used as that row's description.

    Args:
        address: Free-form postal address used as the search query.

    Returns:
        A ``pandas.DataFrame`` with one row per retained search result and the
        columns ``Address Input``, ``Title``, ``Link``, ``Description``,
        ``Address Output``, ``Website``, ``Square Footage``, ``Beds``,
        ``Baths``, ``Year Built`` and ``Match Percent``. Values that the
        regular expressions cannot find are left as NaN; ``Match Percent`` is
        0.0 for rows whose title contains no recognisable address.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    search_query = quote(address)
    url = f'https://www.google.com/search?q={search_query}'
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    # Keep only outbound result links that carry a reasonably long title.
    texts_links = []
    for link in soup.find_all("a"):
        title = link.get_text()
        href = link.get("href") or ''  # anchors without an href yield None
        if href.startswith('/url?q=http') and len(title) > 20:
            texts_links.append((title, href))

    # The description of a result is the page text from its title up to the
    # next result's title (or up to 'Related searches' for the last result).
    text = soup.get_text()
    texts_links_des = []
    for i, (title, href) in enumerate(texts_links):
        start = text.find(title[:50])
        if i + 1 < len(texts_links):
            end = text.find(texts_links[i + 1][0][:50])
        else:
            end = text.find('Related searches')
        texts_links_des.append((title, href, text[start:end]))

    df = pd.DataFrame(texts_links_des, columns=['Title', 'Link', 'Description'])
    df['Description'] = df['Description'].bfill()
    df['Address Output'] = df['Title'].str.extract(r'(.+? \d{5})', expand=False)
    # Strip the '/url?q=' prefix and Google's tracking suffix; partition keeps
    # the whole URL intact when no '&sa=' suffix is present.
    df['Link'] = [href[7:].partition('&sa=')[0] for href in df['Link']]
    df['Website'] = df['Link'].apply(extract_website_domain)

    df['Square Footage'] = df['Description'].str.extract(r"((\d+) Square Feet|(\d+) sq. ft.|(\d+) sqft|(\d+) Sq. Ft.|(\d+) sq|(\d+(?:,\d+)?) Sq\. Ft\.|(\d+(?:,\d+)?) sq)")[0]
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern literally by default, which would leave the unit text in place.
    df['Square Footage'] = df['Square Footage'].str.replace(r'\D', '', regex=True)

    cleaned = df['Description'].replace({'-': ' ', 'total': ''}, regex=True)
    df['Beds'] = cleaned.str.extract(r"(\d+) bed", expand=False)
    df['Baths'] = cleaned.str.extract(r"((\d+) bath|(\d+(?:\.\d+)?) bath)")[0]
    df['Baths'] = df['Baths'].str.extract(r'([\d.]+)', expand=False).astype(float)
    df['Year Built'] = df['Description'].str.extract(r"built in (\d{4})", expand=False)

    # Titles without a recognisable address give NaN above; score those as 0
    # instead of passing a float to the string-similarity helper.
    df['Match Percent'] = [
        jaccard_similarity(address, found) * 100 if isinstance(found, str) else 0.0
        for found in df['Address Output']
    ]
    df.insert(0, 'Address Input', address)
    return df
def catch_errors(addresses):
    """Run ``google_address`` for one address, falling back to a stub row.

    Args:
        addresses: A single address string (the plural name is historical).

    Returns:
        The DataFrame produced by ``google_address``; if that call raises, a
        one-row DataFrame containing only the ``Address Input`` column so the
        failed address still appears in the combined output.
    """
    try:
        return google_address(addresses)
    except Exception:
        # Deliberate best-effort: one failed lookup must not abort a batch.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return pd.DataFrame({'Address Input': [addresses]})
def process_multiple_address(addresses, n_jobs=32):
    """Look up many addresses concurrently.

    Args:
        addresses: Iterable of address strings.
        n_jobs: Number of joblib workers to use (threads are preferred because
            the work is dominated by HTTP requests). Defaults to 32.

    Returns:
        A list with one DataFrame per input address, in input order, each
        produced by ``catch_errors``.
    """
    results = Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(catch_errors)(i) for i in addresses
    )
    return results
# --- Streamlit page ---------------------------------------------------------
# Sidebar inputs choose between a single typed address and an uploaded file of
# addresses; the scraped results are filtered, shown in a table and offered as
# a CSV download.
st.set_page_config(layout="wide")

address = st.sidebar.text_input("Address", "190 Pebble Creek Dr Etna, OH 43062")
uploaded_file = st.sidebar.file_uploader("Choose a file")
address_file = st.sidebar.radio('Choose', ('Single Address', 'File'))
match_percent = st.sidebar.selectbox('Address Match Percentage At Least:', (70, 80, 90, 100, 0))
return_sq = st.sidebar.radio('Return Only Results with Square Footage', ('No', 'Yes'))

# Batch mode needs an actual upload; with none, fall back to the typed address.
if address_file == 'File' and uploaded_file is not None:
    try:
        df = pd.read_csv(uploaded_file)
    except Exception:
        # Not parseable as CSV: rewind the stream and try it as a workbook.
        uploaded_file.seek(0)
        df = pd.read_excel(uploaded_file)
    # The first four columns are combined into one address string; the fourth
    # is treated as a ZIP code and normalised to five zero-padded digits.
    address_cols = list(df.columns[:4])
    zip_col = address_cols[-1]
    df[zip_col] = df[zip_col].astype(str).str[:5].astype(int).astype(str)
    df[zip_col] = df[zip_col].str.zfill(5)
    df['Address All'] = df[address_cols[0]] + ', ' + df[address_cols[1]] + ', ' + df[address_cols[2]] + ' ' + df[address_cols[3]]
    results = process_multiple_address(df['Address All'].values)
    # pd.concat raises on an empty list, which happens for a file with no rows.
    results = pd.concat(results).reset_index(drop=True) if results else pd.DataFrame()
else:
    results = google_address(address).reset_index(drop=True)

# reindex (rather than plain column selection) tolerates frames that lack some
# columns, e.g. when every lookup failed and only 'Address Input' is present.
results = results.reindex(columns=[
    'Address Input', 'Address Output', 'Match Percent', 'Website',
    'Square Footage', 'Beds', 'Baths', 'Year Built',
    'Link', 'Description',
])
results = results.query(f"`Match Percent`>={match_percent}")
if return_sq == 'Yes':
    results = results[results['Square Footage'].notna()].reset_index(drop=True)

with st.container():
    st.dataframe(
        results,
        column_config={
            "Link": st.column_config.LinkColumn("Link"),
            'Match Percent': st.column_config.NumberColumn(format='%.2f %%'),
        },
        hide_index=True,
    )

csv2 = convert_df(results)
st.download_button(
    label="Download data as CSV",
    data=csv2,
    file_name=f'{address}.csv',
    mime='text/csv')

# Hide Streamlit's default menu and footer.
st.markdown(""" <style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style> """, unsafe_allow_html=True)