"""Gradio dashboard for semantic book recommendations.

At import time this module loads the book metadata CSV, embeds the text file
of book descriptions into a Chroma vector store, and builds a small UI that
retrieves books similar to a free-text query, optionally filtered by category
and re-ranked by an emotional tone.
"""

import numpy as np
import pandas as pd
import gradio as gr
import torch

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

books = pd.read_csv('data/books_with_emotions.csv')

# Book Thumbnail
# Rows without a thumbnail become NaN after the concatenation and fall back to
# the placeholder image. (The '&fife=w800' suffix presumably requests a wider
# cover image from the thumbnail host -- confirm against the data source.)
books['large_thumbnail'] = books['thumbnail'] + '&fife=w800'
books['large_thumbnail'] = np.where(
    books['large_thumbnail'].isna(),
    'cover-not-found.jpg',
    books['large_thumbnail'],
)

# Create Vector Database
raw_docs = TextLoader('./data/full_desc.txt', encoding='utf-8').load()
# A chunk size smaller than any line forces one chunk per non-empty line.
# chunk_size=1 splits exactly like the previous chunk_size=0, but also works
# with langchain-text-splitters releases that reject a non-positive size
# (NOTE(review): believed to apply to recent releases -- confirm against the
# pinned version).
text_splitter = CharacterTextSplitter(
    chunk_size=1,
    chunk_overlap=0,
    separator='\n',
)
docs = text_splitter.split_documents(raw_docs)
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
)
database = Chroma.from_documents(docs, embeddings)

# Maps each tone offered in the UI to the score column used for re-ranking.
TONE_COLUMNS = {
    'Happy': 'joy',
    'Surprising': 'surprise',
    'Angry': 'anger',
    'Suspenseful': 'fear',
    'Sad': 'sadness',
}


def _leading_id(text: str):
    """Return the integer that starts *text*, or None if it has none."""
    tokens = text.strip('"').split()
    if not tokens:
        return None
    try:
        return int(tokens[0])
    except ValueError:
        return None


# Retrieval
def retrieval(query: str, category: str = None, tone: str = None,
              init_top_k: int = 80, final_top_k: int = 16) -> pd.DataFrame:
    """Return the books most similar to *query*.

    Args:
        query: Free-text description to search the vector store with.
        category: Value of ``final_categories`` to keep; ``None`` or ``'All'``
            disables the filter.
        tone: One of the keys of ``TONE_COLUMNS``; any other value (including
            ``None`` and ``'All'``) leaves the similarity order untouched.
        init_top_k: Number of chunks fetched from the vector store.
        final_top_k: Maximum number of rows returned.

    Returns:
        A new DataFrame of rows from ``books``, at most ``final_top_k`` long,
        ordered by similarity or by the tone column (descending) when a tone
        is selected.
    """
    recs = database.similarity_search(query, k=init_top_k)

    # Each chunk starts with the book's isbn13; remember the position of the
    # first hit per book so the similarity ranking can be restored below.
    rank = {}
    for rec in recs:
        isbn = _leading_id(rec.page_content)
        if isbn is not None:
            rank.setdefault(isbn, len(rank))

    matches = books[books['isbn13'].isin(list(rank))]
    # Boolean indexing keeps the CSV row order; re-sort by search rank so the
    # head() below keeps the closest matches rather than the earliest rows.
    matches = matches.sort_values(
        by='isbn13',
        key=lambda column: column.map(rank),
        kind='stable',
    )

    if category and category != 'All':
        matches = matches[matches['final_categories'] == category]
    matches = matches.head(final_top_k)

    tone_column = TONE_COLUMNS.get(tone)
    if tone_column is not None:
        # Not in place: `matches` is derived from `books` by filtering.
        matches = matches.sort_values(by=tone_column, ascending=False)
    return matches


def _format_authors(raw) -> str:
    """Turn a ';'-separated author field into 'A, B and C'."""
    if not isinstance(raw, str) or not raw.strip():
        # Missing values are read from the CSV as NaN (a float).
        return 'Unknown author'
    authors = raw.split(';')
    if len(authors) == 1:
        return authors[0]
    return ', '.join(authors[:-1]) + ' and ' + authors[-1]


# Recommendation
def recommend(query: str, category: str, tone: str):
    """Build the gallery items for a search.

    Args:
        query: Free-text description entered by the user.
        category: Selected category, or ``'All'``.
        tone: Selected emotional tone, or ``'All'``.

    Returns:
        A list of ``[image, caption]`` pairs, one per recommended book; empty
        when the query is blank.
    """
    if not query or not query.strip():
        return []

    recs = retrieval(query, category, tone)
    results = []
    for _, row in recs.iterrows():
        raw_description = row['description']
        if not isinstance(raw_description, str):
            raw_description = ''  # missing description (NaN)
        # Keep the caption short: first 30 words only.
        description = ' '.join(raw_description.split()[:30]) + '...'
        authors_str = _format_authors(row['authors'])
        caption = f"{row['full_title']} by {authors_str}: {description}"
        results.append([
            row['large_thumbnail'],
            caption,
        ])
    return results


# Dashboard
# dropna() keeps sorted() from comparing NaN with strings.
categories = ['All'] + sorted(books['final_categories'].dropna().unique())
tones = ['All'] + list(TONE_COLUMNS)

with gr.Blocks(theme=gr.themes.Glass()) as dashboard:
    gr.Markdown('# Semantics Book Recommendation System')

    with gr.Row():
        user_query = gr.Textbox(
            label='Please enter the description of the book you want to read',
            placeholder='e.g. A story about a boy who ...',
        )
        category = gr.Dropdown(
            choices=categories,
            label='Select a category',
            value='All',
        )
        tone = gr.Dropdown(
            choices=tones,
            label='Select an emotional tone',
            value='All',
        )
        btn = gr.Button('Find books')

    gr.Markdown('## Recommendations')
    output = gr.Gallery(
        label='Recommended Books',
        columns=8,
        rows=2,
    )

    btn.click(
        fn=recommend,
        inputs=[user_query, category, tone],
        outputs=output,
    )

if __name__ == '__main__':
    dashboard.launch(share=True)