# YuITC
# Rename to app.py
# 42a1c5a
import numpy as np
import pandas as pd
import gradio as gr
import torch
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
# Load the book catalogue; the emotion columns in this CSV are used later for tone sorting.
books = pd.read_csv('data/books_with_emotions.csv')

# Book Thumbnail
# Append the '&fife=w800' query parameter to every thumbnail URL (presumably a
# width hint for the image host -- confirm). Rows with a missing thumbnail stay
# missing after the concatenation and are pointed at a local placeholder image.
books['large_thumbnail'] = books['thumbnail'] + '&fife=w800'
_has_thumbnail = books['large_thumbnail'].notna()
books['large_thumbnail'] = np.where(
    _has_thumbnail,
    books['large_thumbnail'],
    'cover-not-found.jpg',
)
# Create Vector Database
# The descriptions file is loaded as a single document and then cut at every
# newline, so that each line becomes one searchable chunk. The retrieval code
# parses the first whitespace-separated token of a chunk as the book's ISBN-13.
raw_docs = TextLoader('./data/full_desc.txt', encoding='utf-8').load()

# chunk_size=1 is the smallest positive size: no two lines fit into one chunk
# together, so lines are never merged. A size of 0 yields the same split but is
# not a valid chunk size (recent langchain-text-splitters releases reject it
# with a ValueError -- NOTE(review): confirm against the pinned version).
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator='\n')
docs = text_splitter.split_documents(raw_docs)

# Embed on the GPU when one is available, otherwise fall back to the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
)
database = Chroma.from_documents(docs, embeddings)
# Retrieval
def retrieval(query: str, category: str = None, tone: str = None, init_top_k: int = 80, final_top_k: int = 16) -> pd.DataFrame:
    """Return the books that best match a free-text query.

    Args:
        query: Description of the book the user is looking for.
        category: Value of ``final_categories`` to keep. ``'All'`` or ``None``
            disables the category filter.
        tone: One of ``'Happy'``, ``'Surprising'``, ``'Angry'``,
            ``'Suspenseful'`` or ``'Sad'`` to order the result by the matching
            emotion score (highest first). Any other value keeps the
            similarity order.
        init_top_k: Number of chunks requested from the vector store.
        final_top_k: Maximum number of rows returned.

    Returns:
        A new DataFrame holding at most ``final_top_k`` rows of ``books``.
    """
    recs = database.similarity_search(query, k=init_top_k)

    # Every chunk starts with the ISBN-13 of the book it describes. Keep the
    # best (lowest) search position per ISBN so the ranking can be restored.
    rank_by_isbn = {}
    for position, rec in enumerate(recs):
        isbn = int(rec.page_content.strip('"').split()[0])
        rank_by_isbn.setdefault(isbn, position)

    matches = books[books['isbn13'].isin(list(rank_by_isbn))]
    # Boolean indexing returns rows in catalogue order; re-sort them by search
    # rank so that the truncation below keeps the most similar books.
    matches = matches.sort_values(
        by='isbn13',
        key=lambda isbns: isbns.map(rank_by_isbn),
        kind='stable',
    )

    # None (the default) must behave like 'All'; comparing the column against
    # None would otherwise filter out every row.
    if category is not None and category != 'All':
        matches = matches[matches['final_categories'] == category]
    matches = matches.head(final_top_k)

    emotion_by_tone = {
        'Happy': 'joy',
        'Surprising': 'surprise',
        'Angry': 'anger',
        'Suspenseful': 'fear',
        'Sad': 'sadness',
    }
    emotion = emotion_by_tone.get(tone)
    if emotion is not None:
        # Non-inplace sort: `matches` is derived from `books`, and sorting such
        # a slice in place triggers pandas' chained-assignment warning.
        matches = matches.sort_values(by=emotion, ascending=False)
    return matches
# Recommendation
def _format_authors(raw_authors) -> str:
    """Turn a ';'-separated author field into a readable 'A, B and C' string."""
    # A missing CSV cell is read as NaN (a float), which has no .split().
    if not isinstance(raw_authors, str):
        return 'Unknown author'
    authors = raw_authors.split(';')
    if len(authors) == 2:
        return authors[0] + ' and ' + authors[1]
    if len(authors) > 2:
        return ', '.join(authors[:-1]) + ' and ' + authors[-1]
    return authors[0]


def recommend(query: str, category: str, tone: str):
    """Build the gallery entries for a user query.

    Args:
        query: Free-text description entered by the user.
        category: Selected category, or ``'All'``.
        tone: Selected emotional tone, or ``'All'``.

    Returns:
        A list of ``[image, caption]`` pairs, one per recommended book. The
        caption has the form ``"<full_title> by <authors>: <first 30 words of
        the description>..."``.
    """
    recs = retrieval(query, category, tone)
    results = []
    for _, row in recs.iterrows():
        raw_description = row['description']
        # Guard against a missing description for the same reason as authors.
        words = raw_description.split() if isinstance(raw_description, str) else []
        description = ' '.join(words[:30]) + '...'

        authors_str = _format_authors(row['authors'])
        caption = f"{row['full_title']} by {authors_str}: {description}"
        results.append([
            row['large_thumbnail'],
            caption,
        ])
    return results
# Dashboard
# Dropdown options. 'All' comes first and is the preselected value of both dropdowns.
categories = ['All', *sorted(books['final_categories'].unique())]
tones = ['All', 'Happy', 'Surprising', 'Angry', 'Suspenseful', 'Sad']

with gr.Blocks(theme=gr.themes.Glass()) as dashboard:
    gr.Markdown('# Semantics Book Recommendation System')

    # Input row: query text, the two filters and the submit button.
    with gr.Row():
        user_query = gr.Textbox(
            placeholder='e.g. A story about a boy who ...',
            label='Please enter the description of the book you want to read',
        )
        category = gr.Dropdown(
            label='Select a category',
            choices=categories,
            value='All',
        )
        tone = gr.Dropdown(
            label='Select an emotional tone',
            choices=tones,
            value='All',
        )
        btn = gr.Button('Find books')

    # Result area: a gallery configured with 8 columns and 2 rows.
    gr.Markdown('## Recommendations')
    output = gr.Gallery(label='Recommended Books', columns=8, rows=2)

    # Clicking the button passes the three inputs to recommend() and shows
    # its return value in the gallery.
    btn.click(
        fn=recommend,
        inputs=[user_query, category, tone],
        outputs=output,
    )
# Start the Gradio app only when this file is executed as a script, not on import.
# share=True asks Gradio to also create a public share link for the app.
if __name__ == '__main__':
    dashboard.launch(share=True)