In [1]:
import pandas as pd

# Load the cleaned book metadata and preview the first five rows
books = pd.read_csv('data/books_cleaned.csv')
books.head(n=5)

Unnamed: 0,isbn13,full_title,authors,categories,description,full_desc,published_year,num_pages,average_rating,ratings_count,thumbnail
0,9780002005883,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,9780002005883 A NOVEL THAT READERS and critics...,2004.0,247.0,3.85,361.0,http://books.google.com/books/content?id=KQZCP...
1,9780002261982,Spider's Web: A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,9780002261982 A new 'Christie for Christmas' -...,2000.0,241.0,3.83,5164.0,http://books.google.com/books/content?id=gA5GP...
2,9780006178736,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...","9780006178736 A memorable, mesmerizing heroine...",1993.0,512.0,3.93,29532.0,http://books.google.com/books/content?id=FKo2T...
3,9780006280897,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,9780006280897 Lewis' work on the nature of lov...,2002.0,170.0,4.15,33684.0,http://books.google.com/books/content?id=XhQ5X...
4,9780006280934,The Problem of Pain,Clive Staples Lewis,Christian life,"""In The Problem of Pain, C.S. Lewis, one of th...","9780006280934 ""In The Problem of Pain, C.S. Le...",2002.0,176.0,4.09,37569.0,http://books.google.com/books/content?id=Kk-uV...


In [3]:
# Frequency of each raw category; keep only those with at least 50 books and
# transpose so the table reads left to right.
category_counts = books['categories'].value_counts().reset_index()
category_counts.query('count >= 50').T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
categories,Fiction,Juvenile Fiction,Biography & Autobiography,History,Literary Criticism,Religion,Philosophy,Comics & Graphic Novels,Drama,Juvenile Nonfiction,Science,Poetry,Literary Collections
count,2111,390,311,207,124,117,117,116,86,57,56,51,50


In [4]:
# Collapse the most frequent raw categories into four coarse labels. The groups
# are keyed by the coarse label and then inverted into a raw -> coarse lookup.
# NOTE(review): 'Literary Collections' also reaches the count >= 50 threshold in
# the frequency table but has no entry here -- confirm the omission is intended.
category_groups = {
    "Fiction"              : ['Fiction', 'Comics & Graphic Novels', 'Drama', 'Poetry'],
    "Children's Fiction"   : ['Juvenile Fiction'],
    "Nonfiction"           : ['Biography & Autobiography', 'History', 'Literary Criticism',
                              'Philosophy', 'Religion', 'Science'],
    "Children's Nonfiction": ['Juvenile Nonfiction'],
}
category_mapping = {
    raw_category: simple_label
    for simple_label, raw_categories in category_groups.items()
    for raw_category in raw_categories
}

# Raw categories that are not in the mapping end up as NaN in the new column
books['simple_categories'] = books['categories'].map(category_mapping)

# Series.count() counts the non-missing entries, i.e. the books that were mapped
mapped_count = books['simple_categories'].count()
print(f"Total books: {len(books)}")
print(f"Books with simple categories: {mapped_count}")
books['simple_categories'].value_counts().reset_index()

Total books: 5197
Books with simple categories: 3743


Unnamed: 0,simple_categories,count
0,Fiction,2364
1,Nonfiction,932
2,Children's Fiction,390
3,Children's Nonfiction,57


In [5]:
import torch
from transformers import pipeline

# Candidate labels handed to the zero-shot classifier
fiction_categories = ['Fiction', 'Nonfiction']

# Run on the first CUDA GPU when one is available; otherwise fall back to the
# CPU (device=-1) so the notebook still runs on machines without a GPU.
pipe = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli',
    device=0 if torch.cuda.is_available() else -1,
)

Device set to use cuda:0


In [6]:
import numpy as np

def generate_predictions(sequence, categories):
    """Return the candidate label that the zero-shot pipeline scores highest.

    Args:
        sequence: Text passed to the module-level ``pipe`` classifier.
        categories: Candidate labels passed to ``pipe`` alongside the text.

    Returns:
        The entry of the result's ``'labels'`` list at the position of the
        largest value in its ``'scores'`` list.
    """
    result         = pipe(sequence, categories)
    labels, scores = result['labels'], result['scores']
    return labels[np.argmax(scores)]

In [8]:
from tqdm import tqdm

# For each label, classify up to 500 books that already carry that label and
# record the fraction the classifier gets right.
demo_500_accuracy = {}
for label in ['Fiction', 'Nonfiction']:
    has_label = books['simple_categories'] == label
    descs     = books.loc[has_label, 'description'].head(500).reset_index(drop=True)

    progress = tqdm(descs, desc=f'Calculating accuracy for 500 {label} books')
    # Each comparison is a bool, so summing them counts the correct predictions
    correct  = sum(generate_predictions(desc, fiction_categories) == label for desc in progress)

    demo_500_accuracy[label] = correct / len(descs)

# Macro average: unweighted mean of the per-label accuracies computed above
per_label_scores = list(demo_500_accuracy.values())
demo_500_accuracy['total'] = sum(per_label_scores) / len(per_label_scores)
demo_500_accuracy

Calculating accuracy for 500 Fiction books: 100%|██████████| 500/500 [00:42<00:00, 11.89it/s]
Calculating accuracy for 500 Nonfiction books: 100%|██████████| 500/500 [00:42<00:00, 11.70it/s]


{'Fiction': 0.674, 'Nonfiction': 0.866, 'total': 0.77}

In [9]:
# Classify every book whose raw category was not covered by the simple mapping
non_cat_books = (
    books.loc[books['simple_categories'].isna(), ['isbn13', 'description']]
    .reset_index(drop=True)
)
isbns, preds = [], []

# Walk the two columns in lockstep; `total` lets tqdm size the bar, since a zip
# object has no length of its own.
for isbn, sequence in tqdm(
    zip(non_cat_books['isbn13'], non_cat_books['description']),
    total=len(non_cat_books),
    desc="Predicting for books without simple categories",
):
    isbns.append(isbn)
    preds.append(generate_predictions(sequence, fiction_categories))

Predicting for books without simple categories: 100%|██████████| 1454/1454 [02:13<00:00, 10.93it/s]


In [10]:
# Create predicted books dataframe: one row per ISBN that was classified
preds_df = pd.DataFrame({'isbn13': isbns, 'predicted_categories': preds})

# Left-join so every book is kept; books with no row in preds_df get NaN in
# `predicted_categories`. `validate` makes pandas raise a MergeError if an ISBN
# occurs more than once in preds_df, which would otherwise silently duplicate
# rows in the joined frame.
books_with_cat = pd.merge(books, preds_df, on='isbn13', how='left', validate='many_to_one')
books_with_cat.head()

Unnamed: 0,isbn13,full_title,authors,categories,description,full_desc,published_year,num_pages,average_rating,ratings_count,thumbnail,simple_categories,predicted_categories
0,9780002005883,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,9780002005883 A NOVEL THAT READERS and critics...,2004.0,247.0,3.85,361.0,http://books.google.com/books/content?id=KQZCP...,Fiction,
1,9780002261982,Spider's Web: A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,9780002261982 A new 'Christie for Christmas' -...,2000.0,241.0,3.83,5164.0,http://books.google.com/books/content?id=gA5GP...,,Fiction
2,9780006178736,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...","9780006178736 A memorable, mesmerizing heroine...",1993.0,512.0,3.93,29532.0,http://books.google.com/books/content?id=FKo2T...,Fiction,
3,9780006280897,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,9780006280897 Lewis' work on the nature of lov...,2002.0,170.0,4.15,33684.0,http://books.google.com/books/content?id=XhQ5X...,,Nonfiction
4,9780006280934,The Problem of Pain,Clive Staples Lewis,Christian life,"""In The Problem of Pain, C.S. Lewis, one of th...","9780006280934 ""In The Problem of Pain, C.S. Le...",2002.0,176.0,4.09,37569.0,http://books.google.com/books/content?id=Kk-uV...,,Nonfiction


In [11]:
# Prefer the model's prediction where one exists, otherwise keep the mapped
# simple category; then drop the two helper columns that fed the result.
books_with_cat = (
    books_with_cat
    .assign(
        final_categories=lambda frame: frame['predicted_categories'].fillna(frame['simple_categories'])
    )
    .drop(columns=['simple_categories', 'predicted_categories'])
)

# Persist the enriched table without writing the row index
books_with_cat.to_csv('data/books_with_categories.csv', index=False)
books_with_cat.head()

Unnamed: 0,isbn13,full_title,authors,categories,description,full_desc,published_year,num_pages,average_rating,ratings_count,thumbnail,final_categories
0,9780002005883,Gilead,Marilynne Robinson,Fiction,A NOVEL THAT READERS and critics have been eag...,9780002005883 A NOVEL THAT READERS and critics...,2004.0,247.0,3.85,361.0,http://books.google.com/books/content?id=KQZCP...,Fiction
1,9780002261982,Spider's Web: A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,A new 'Christie for Christmas' -- a full-lengt...,9780002261982 A new 'Christie for Christmas' -...,2000.0,241.0,3.83,5164.0,http://books.google.com/books/content?id=gA5GP...,Fiction
2,9780006178736,Rage of angels,Sidney Sheldon,Fiction,"A memorable, mesmerizing heroine Jennifer -- b...","9780006178736 A memorable, mesmerizing heroine...",1993.0,512.0,3.93,29532.0,http://books.google.com/books/content?id=FKo2T...,Fiction
3,9780006280897,The Four Loves,Clive Staples Lewis,Christian life,Lewis' work on the nature of love divides love...,9780006280897 Lewis' work on the nature of lov...,2002.0,170.0,4.15,33684.0,http://books.google.com/books/content?id=XhQ5X...,Nonfiction
4,9780006280934,The Problem of Pain,Clive Staples Lewis,Christian life,"""In The Problem of Pain, C.S. Lewis, one of th...","9780006280934 ""In The Problem of Pain, C.S. Le...",2002.0,176.0,4.09,37569.0,http://books.google.com/books/content?id=Kk-uV...,Nonfiction
