# Spaces:
# Sleeping
# Sleeping
import sqlite3
from timeit import default_timer as timer

import pandas as pd
import streamlit as st
from streamlit.logger import get_logger

# Module-level logger obtained from Streamlit's logging helper.
LOGGER = get_logger(__name__)
def preprocess(s: str) -> str:
    """Remove quote characters and a fixed list of keywords from ``s``.

    This function is passed as the ``processor`` argument of the fuzzy
    matcher in this file, so it is applied to the text before scoring.

    Args:
        s: The text to clean.

    Returns:
        ``s`` with every occurrence of each noise token deleted.
    """
    # NOTE(review): the non-ASCII tokens below look mis-encoded in this copy
    # of the file (presumably Hebrew location words) -- confirm against the
    # original source encoding before relying on them.
    noise_tokens = (
        '"',
        'ืขื',
        'ืคืจืง',
        'ืคืกืืง',
        'ืืฃ',
        'ืขืืื',
        'ืกืืื',
        'ืกืขืืฃ',
        'ืืืืืฉื',
        "'",
    )
    # Tokens are removed one after another in this exact order, because
    # deleting one token can bring the pieces of a later one together.
    for token in noise_tokens:
        s = s.replace(token, '')
    return s
def _read_table(conn, table_name):
    """Load every row of ``table_name`` into a DataFrame with the table's column names."""
    # table_name is only ever one of the fixed names used by get_dfs, never
    # user input, so interpolating it into the statement is safe here.
    cursor = conn.execute(f"SELECT * FROM {table_name}")
    column_names = [column[0] for column in cursor.description]
    # Passing columns= up front also works for an empty table, where
    # assigning .columns afterwards would fail with a length mismatch.
    return pd.DataFrame(cursor.fetchall(), columns=column_names)


def get_dfs(db_path: str = 'test42.db') -> tuple:
    """Load the lookup tables from the SQLite database.

    Reads the ``titles``, ``texts``, ``refs`` and ``books`` tables, then
    joins ``texts`` with ``books`` (``texts.bid == books._id``) and the
    result with ``refs`` (``_id_x == refs.tid``).  ``_id_x`` is the suffixed
    ``_id`` column of ``texts``, which exists because both ``texts`` and
    ``books`` are expected to carry an ``_id`` column.

    Args:
        db_path: Path of the SQLite database file.  Defaults to the file the
            application has always used.

    Returns:
        A ``(titles_df, texts_df)`` tuple: the raw ``titles`` table and the
        merged texts/books/refs table.
    """
    print('hello from get_dfs..')

    conn = sqlite3.connect(db_path)
    try:
        titles = _read_table(conn, 'titles')
        texts = _read_table(conn, 'texts')
        refs = _read_table(conn, 'refs')
        books = _read_table(conn, 'books')
    finally:
        # Always release the database handle, even if a query fails.
        conn.close()

    # Attach the book row, then the reference row, to every text row.
    merged = pd.merge(texts, books, how='inner', left_on='bid', right_on='_id')
    texts_df = pd.merge(merged, refs, left_on='_id_x', right_on='tid')
    return titles, texts_df
# Names of the rapidfuzz.fuzz scorers that find_ref accepts.
_SCORER_NAMES = (
    'token_ratio',
    'ratio',
    'WRatio',
    'partial_ratio',
    'token_set_ratio',
    'partial_token_set_ratio',
    'token_sort_ratio',
)


def find_ref(titles_df, texts_df, input_text, top_k, num_of_results, algorithm):
    """Fuzzy-search the reference strings for the best matches to ``input_text``.

    Args:
        titles_df: DataFrame with at least ``he_titles`` and ``title`` columns.
        texts_df: DataFrame with at least ``title`` and ``ref_text_long`` columns.
        input_text: The query typed by the user.
        top_k: Number of best-matching books to search inside.  ``0`` skips
            the book stage and searches every reference directly.
        num_of_results: Maximum number of results to return.
        algorithm: Name of a ``rapidfuzz.fuzz`` scorer; must be one of
            ``_SCORER_NAMES``.

    Returns:
        ``None`` when ``input_text`` is empty.  Otherwise a list of dicts
        sorted by descending ``ref_score``, each with ``ref`` and
        ``ref_score`` keys, plus ``book`` and ``book_score`` when
        ``top_k`` is non-zero.

    Raises:
        ValueError: If ``algorithm`` is not a supported scorer name.
    """
    print('hello from find_ref..')
    if not input_text:
        return None

    # The scorer used to be resolved with eval(), which would execute any
    # expression passed in as the name.  Look it up from a fixed allow-list.
    if algorithm not in _SCORER_NAMES:
        raise ValueError(f"unsupported scoring algorithm: {algorithm!r}")

    # Imported after the cheap guards so they run without loading rapidfuzz.
    from rapidfuzz import fuzz, process as rapidfuzz_process

    scorer = getattr(fuzz, algorithm)
    print(scorer)

    # NOTE(review): both replacement literals are identical in this copy of
    # the file and look mis-encoded; presumably ':' and '.' originally mapped
    # to two different words -- confirm against the original source.
    input_text = input_text.replace(':', 'ืขืืื ื').replace('.', 'ืขืืื ื')

    if top_k == 0:
        # Search the reference strings directly, skipping the book stage.
        refs = texts_df['ref_text_long'].unique()
        results = [
            {'ref': ref, 'ref_score': ref_score}
            for ref, ref_score, _ in rapidfuzz_process.extract(
                input_text, refs, scorer=scorer, limit=num_of_results,
                processor=preprocess)
        ]
    else:
        results = []
        books = titles_df['he_titles']
        # Stage 1: pick the top_k books whose title best matches the query.
        for book, book_score, _ in rapidfuzz_process.extract(
                input_text, books, scorer=scorer, limit=top_k,
                processor=preprocess):
            # Stage 2: collect the references that belong to that book ...
            book_title = list(titles_df.loc[titles_df['he_titles'] == book]['title'])[0]
            refs = texts_df.loc[texts_df['title'] == book_title]['ref_text_long'].unique()
            # ... and keep up to 10 best-matching references per book.
            results.extend(
                {'ref': ref, 'ref_score': ref_score,
                 'book': book, 'book_score': book_score}
                for ref, ref_score, _ in rapidfuzz_process.extract(
                    input_text, refs, limit=10, scorer=scorer,
                    processor=preprocess)
            )

    # Rank by the reference's own score, not by the score of its book.
    results.sort(key=lambda item: item['ref_score'], reverse=True)
    return results[:num_of_results]
def run():
    """Render the Streamlit page: query box, sidebar settings and results.

    Loads the data frames, reads the query and the search settings from the
    widgets, runs ``find_ref`` when a query is present, and shows each
    result with a button that displays the matching text rows.
    """
    # NOTE(review): the non-ASCII UI strings below look mis-encoded in this
    # copy of the file; they are kept exactly as found -- confirm against
    # the original source encoding.
    st.set_page_config(
        page_title=" ืืืคืืฉ ืืงืืจืืช",
        page_icon="๐",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    st.write("# ืืืคืืฉ ืืงืืจืืช ืืืืฆืขืืช ืืจืืง ืืืื ืฉืืืื")

    # Load the tables once; a second call whose result was discarded used to
    # read the whole database twice on every rerun.
    titles_df, texts_df = get_dfs()

    user_input = st.text_input('ืืชืื ืืช ืืืงืืจ ืืืืืงืฉ', placeholder='ืืื ืงืื ืืฃ ื ืขืืื ื')
    top_k = st.sidebar.slider('ืืื ืกืคืจืื ืืกืจืืง top_k:', 0, 20, 10)
    num_of_results = st.sidebar.slider('ืืกืคืจ ืืชืืฆืืืช ืฉืืจืฆืื ื ืืืฆืื:', 1, 25, 5)
    algorithm = st.sidebar.selectbox(
        'ืืืืืืจืืชื ืืืืจืื ืืชืืฆืืืช',
        ['token_ratio', 'ratio', 'WRatio', 'partial_ratio',
         'token_set_ratio', 'partial_token_set_ratio', 'token_sort_ratio'])

    if user_input != "":
        started = timer()
        results = find_ref(titles_df, texts_df, user_input, top_k, num_of_results, algorithm)
        elapsed_message = f"finished in {1e3*(timer()-started):.1f} ms"
        st.write(elapsed_message)

        for i, result in enumerate(results):
            st.write(result)
            # The loop index is passed as the widget key so that each
            # result's button is distinct.
            if st.button("ืคืชื " + result['ref'], i):
                st.write(texts_df.loc[texts_df['ref_text_long'] == result['ref']][['heText', 'ref_text_long']])
# Build the page only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    run()