Spaces · Commit 7149684 (parent: 56aaa78) · Update app.py

app.py CHANGED
@@ -7,12 +7,29 @@ import re
 from bs4 import BeautifulSoup
 import time
 from joblib import Parallel, delayed
-
+from nltk import ngrams
 
 @st.cache_data
 def convert_df(df):
     return df.to_csv()
 
+def normalize_string(string):
+    normalized_string = string.lower()
+    normalized_string = re.sub(r'[^\w\s]', '', normalized_string)
+
+    return normalized_string
+
+def jaccard_similarity(string1, string2,n = 2, normalize=True):
+
+    if normalize:
+        string1,string2= normalize_string(string1),normalize_string(string2)
+
+    grams1 = set(ngrams(string1, n))
+    grams2 = set(ngrams(string2, n))
+    similarity = len(grams1.intersection(grams2)) / len(grams1.union(grams2))
+    return similarity
+
+
 def extract_website_domain(url):
     parsed_url = urlparse(url)
     return parsed_url.netloc
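The helpers added above score how closely a scraped result matches the address the user typed, using sets of character bigrams (n=2 by default). A minimal usage sketch outside the app, with the second spelling invented for illustration:

from nltk import ngrams
import re

def normalize_string(string):
    # lowercase and strip punctuation, mirroring the helper added above
    return re.sub(r'[^\w\s]', '', string.lower())

def jaccard_similarity(string1, string2, n=2, normalize=True):
    if normalize:
        string1, string2 = normalize_string(string1), normalize_string(string2)
    grams1 = set(ngrams(string1, n))  # set of character bigrams
    grams2 = set(ngrams(string2, n))
    return len(grams1 & grams2) / len(grams1 | grams2)

# "Dr" vs "Drive" and a dropped comma still share most bigrams,
# so this prints a similarity of roughly 0.9
print(jaccard_similarity("190 Pebble Creek Dr Etna, OH 43062",
                         "190 Pebble Creek Drive Etna OH 43062"))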
@@ -50,7 +67,7 @@ def google_address(address):
 
     df=pd.DataFrame(texts_links_des,columns=['Title','Link','Description'])
     df['Description']=df['Description'].bfill()
-    df['Address']=df['Title'].str.extract(r'(.+? \d{5})')
+    df['Address Output']=df['Title'].str.extract(r'(.+? \d{5})')
     df['Link']=[i[7:i.find('&sa=')] for i in df['Link']]
     df['Website'] = df['Link'].apply(extract_website_domain)
 
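For context on the renamed column: 'Address Output' is captured by the same non-greedy pattern as before, which grabs everything in a result title up to the first five-digit ZIP code. A small illustration with an invented title string:

import pandas as pd

titles = pd.Series(["190 Pebble Creek Dr, Etna, OH 43062 - 4 bd, 2 ba"])
# the non-greedy .+? stops at the first " <five digits>" it reaches
print(titles.str.extract(r'(.+? \d{5})'))
# captures "190 Pebble Creek Dr, Etna, OH 43062"; titles without a ZIP yield NaN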
@@ -64,12 +81,15 @@ def google_address(address):
     df['Baths']=df['Baths'].str.extract(r'([\d.]+)').astype(float)
 
     df['Year Built']=df['Description'].str.extract(r"built in (\d{4})")
+    df['Match Percent']=[jaccard_similarity(address,i)*100 for i in df['Address Output']]
+
 
-    df_final=df[df['Address'].notnull()]
-    df_final=df_final[(df_final['Address'].str.contains(str(address_number))) & (df_final['Address'].str.contains(str(address_zip)))]
+    # df_final=df[df['Address Output'].notnull()]
+    # df_final=df_final[(df_final['Address Output'].str.contains(str(address_number))) & (df_final['Address Output'].str.contains(str(address_zip)))]
 
-
-
+    df.insert(0,'Address Input',address)
+
+    return df
 
 def catch_errors(addresses):
     try:
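The unchanged context lines in this hunk use the same str.extract idiom to pull numeric fields, and the new 'Match Percent' line applies jaccard_similarity to each extracted address. A quick illustration of the two extract patterns on invented sample strings:

import pandas as pd

baths = pd.Series(["2.5 ba"])
print(baths.str.extract(r'([\d.]+)').astype(float))   # -> 2.5

desc = pd.Series(["Single family home built in 1998, 1,850 sqft"])
print(desc.str.extract(r"built in (\d{4})"))          # -> "1998"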
@@ -84,18 +104,14 @@ def process_multiple_address(addresses):
 
 
 
-
-
 st.set_page_config(layout="wide")
-# col1, col2 = st.columns((2))
-address_file = st.sidebar.radio('Choose',('Single Address', 'File'))
 
 address = st.sidebar.text_input("Address", "190 Pebble Creek Dr Etna, OH 43062")
 uploaded_file = st.sidebar.file_uploader("Choose a file")
-
+address_file = st.sidebar.radio('Choose',('Single Address', 'File'))
+match_percent = st.sidebar.selectbox('Address Match Percentage At Least:',(70, 80, 90, 100, 0))
 return_sq = st.sidebar.radio('Return Only Results with Square Footage',('No', 'Yes'))
 
-
 if address_file == 'File' and not None:
     try:
         df = pd.read_csv(uploaded_file)
@@ -110,30 +126,38 @@ if address_file == 'File' and not None:
 
         results= process_multiple_address(df['Address All'].values)
         results=pd.concat(results).reset_index(drop=1)
-        results.index=results.index+1
+        # results.index=results.index+1
 
 else:
     results=google_address(address).reset_index(drop=1)
-    results.index=results.index+1
+    # results.index=results.index+1
 
 
-
-results=results[['Address Input', 'Address', 'Website','Square Footage', 'Beds', 'Baths', 'Year Built',
+results=results[['Address Input', 'Address Output','Match Percent', 'Website','Square Footage', 'Beds', 'Baths', 'Year Built',
                  'Link', 'Description',
                  ]]
 
+results=results.query(f"`Match Percent`>={match_percent}")
+
 if return_sq=='Yes':
     results=results.query("`Square Footage`==`Square Footage`").reset_index(drop=1)
-    results.index=results.index+1
-
-
-
-
-
-
-
-
-
+    # results.index=results.index+1
+
+
+
+with st.container():
+
+    st.dataframe(
+        results,
+        column_config={
+
+            "Link": st.column_config.LinkColumn("Link"),
+            'Match Percent': st.column_config.NumberColumn(format='%.2f %%'),
+        },
+        hide_index=True,
+        # height=500,
+        # width=500,
+    )
 
 csv2 = convert_df(results)
 st.download_button(
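Two idioms in this last hunk are easy to miss: query() needs backticks around column names that contain spaces, and comparing a column to itself keeps only non-NaN rows (NaN is never equal to NaN). The display step relies on st.column_config, available in recent Streamlit releases. A self-contained sketch using a stand-in DataFrame, not the app's real results:

import pandas as pd
import streamlit as st

demo = pd.DataFrame({
    "Address Output": ["190 Pebble Creek Dr, Etna, OH 43062", "Etna, OH 43062"],
    "Match Percent": [92.1, 55.0],
    "Square Footage": [1850.0, float("nan")],
    "Link": ["https://example.com/listing/1", "https://example.com/listing/2"],
})

# backticks let query() reference columns whose names contain spaces
demo = demo.query("`Match Percent` >= 70")

# a value only equals itself when it is not NaN, so this drops rows without square footage
demo = demo.query("`Square Footage` == `Square Footage`")

st.dataframe(
    demo,
    column_config={
        "Link": st.column_config.LinkColumn("Link"),                       # render as clickable links
        "Match Percent": st.column_config.NumberColumn(format="%.2f %%"),  # two decimals plus a percent sign
    },
    hide_index=True,
)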