{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
isbn13full_titleauthorscategoriesdescriptionfull_descpublished_yearnum_pagesaverage_ratingratings_countthumbnail
09780002005883GileadMarilynne RobinsonFictionA NOVEL THAT READERS and critics have been eag...9780002005883 A NOVEL THAT READERS and critics...2004.0247.03.85361.0http://books.google.com/books/content?id=KQZCP...
19780002261982Spider's Web: A NovelCharles Osborne;Agatha ChristieDetective and mystery storiesA new 'Christie for Christmas' -- a full-lengt...9780002261982 A new 'Christie for Christmas' -...2000.0241.03.835164.0http://books.google.com/books/content?id=gA5GP...
29780006178736Rage of angelsSidney SheldonFictionA memorable, mesmerizing heroine Jennifer -- b...9780006178736 A memorable, mesmerizing heroine...1993.0512.03.9329532.0http://books.google.com/books/content?id=FKo2T...
39780006280897The Four LovesClive Staples LewisChristian lifeLewis' work on the nature of love divides love...9780006280897 Lewis' work on the nature of lov...2002.0170.04.1533684.0http://books.google.com/books/content?id=XhQ5X...
49780006280934The Problem of PainClive Staples LewisChristian life\"In The Problem of Pain, C.S. Lewis, one of th...9780006280934 \"In The Problem of Pain, C.S. Le...2002.0176.04.0937569.0http://books.google.com/books/content?id=Kk-uV...
\n", "
" ], "text/plain": [ " isbn13 full_title authors \\\n", "0 9780002005883 Gilead Marilynne Robinson \n", "1 9780002261982 Spider's Web: A Novel Charles Osborne;Agatha Christie \n", "2 9780006178736 Rage of angels Sidney Sheldon \n", "3 9780006280897 The Four Loves Clive Staples Lewis \n", "4 9780006280934 The Problem of Pain Clive Staples Lewis \n", "\n", " categories \\\n", "0 Fiction \n", "1 Detective and mystery stories \n", "2 Fiction \n", "3 Christian life \n", "4 Christian life \n", "\n", " description \\\n", "0 A NOVEL THAT READERS and critics have been eag... \n", "1 A new 'Christie for Christmas' -- a full-lengt... \n", "2 A memorable, mesmerizing heroine Jennifer -- b... \n", "3 Lewis' work on the nature of love divides love... \n", "4 \"In The Problem of Pain, C.S. Lewis, one of th... \n", "\n", " full_desc published_year \\\n", "0 9780002005883 A NOVEL THAT READERS and critics... 2004.0 \n", "1 9780002261982 A new 'Christie for Christmas' -... 2000.0 \n", "2 9780006178736 A memorable, mesmerizing heroine... 1993.0 \n", "3 9780006280897 Lewis' work on the nature of lov... 2002.0 \n", "4 9780006280934 \"In The Problem of Pain, C.S. Le... 2002.0 \n", "\n", " num_pages average_rating ratings_count \\\n", "0 247.0 3.85 361.0 \n", "1 241.0 3.83 5164.0 \n", "2 512.0 3.93 29532.0 \n", "3 170.0 4.15 33684.0 \n", "4 176.0 4.09 37569.0 \n", "\n", " thumbnail \n", "0 http://books.google.com/books/content?id=KQZCP... \n", "1 http://books.google.com/books/content?id=gA5GP... \n", "2 http://books.google.com/books/content?id=FKo2T... \n", "3 http://books.google.com/books/content?id=XhQ5X... \n", "4 http://books.google.com/books/content?id=Kk-uV... " ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "books = pd.read_csv('data/books_cleaned.csv')\n", "books.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789101112
categoriesFictionJuvenile FictionBiography & AutobiographyHistoryLiterary CriticismReligionPhilosophyComics & Graphic NovelsDramaJuvenile NonfictionSciencePoetryLiterary Collections
count21113903112071241171171168657565150
\n", "
" ], "text/plain": [ " 0 1 2 3 \\\n", "categories Fiction Juvenile Fiction Biography & Autobiography History \n", "count 2111 390 311 207 \n", "\n", " 4 5 6 7 \\\n", "categories Literary Criticism Religion Philosophy Comics & Graphic Novels \n", "count 124 117 117 116 \n", "\n", " 8 9 10 11 12 \n", "categories Drama Juvenile Nonfiction Science Poetry Literary Collections \n", "count 86 57 56 51 50 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books['categories'].value_counts().reset_index().query('count >= 50').T" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total books: 5197\n", "Books with simple categories: 3743\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
simple_categoriescount
0Fiction2364
1Nonfiction932
2Children's Fiction390
3Children's Nonfiction57
\n", "
" ], "text/plain": [ " simple_categories count\n", "0 Fiction 2364\n", "1 Nonfiction 932\n", "2 Children's Fiction 390\n", "3 Children's Nonfiction 57" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "category_mapping = {\n", " 'Fiction' : \"Fiction\",\n", " 'Juvenile Fiction' : \"Children's Fiction\",\n", " 'Biography & Autobiography': \"Nonfiction\",\n", " 'History' : \"Nonfiction\",\n", " 'Literary Criticism' : \"Nonfiction\",\n", " 'Philosophy' : \"Nonfiction\",\n", " 'Religion' : \"Nonfiction\",\n", " 'Comics & Graphic Novels' : \"Fiction\",\n", " 'Drama' : \"Fiction\",\n", " 'Juvenile Nonfiction' : \"Children's Nonfiction\",\n", " 'Science' : \"Nonfiction\",\n", " 'Poetry' : \"Fiction\"\n", "}\n", "\n", "books['simple_categories'] = books['categories'].map(category_mapping)\n", "\n", "print(f\"Total books: {len(books)}\")\n", "print(f\"Books with simple categories: {len(books[~(books['simple_categories'].isna())])}\")\n", "books['simple_categories'].value_counts().reset_index()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Device set to use cuda:0\n" ] } ], "source": [ "from transformers import pipeline\n", "\n", "fiction_categories = ['Fiction', 'Nonfiction']\n", "pipe = pipeline('zero-shot-classification', model='facebook/bart-large-mnli', device=0)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "def generate_predictions(sequence, categories):\n", " predictions = pipe(sequence, categories)\n", " max_index = np.argmax(predictions['scores'])\n", " max_label = predictions['labels'][max_index]\n", " return max_label" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Calculating accuracy for 500 Fiction books: 100%|██████████| 500/500 [00:42<00:00, 11.89it/s]\n", "Calculating accuracy for 500 Nonfiction books: 100%|██████████| 500/500 [00:42<00:00, 11.70it/s]\n" ] }, { "data": { "text/plain": [ "{'Fiction': 0.674, 'Nonfiction': 0.866, 'total': 0.77}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from tqdm import tqdm\n", "\n", "# Calculate accuracy for 500 books for each category\n", "demo_500_accuracy = {}\n", "for label in ['Fiction', 'Nonfiction']:\n", " correct = 0\n", " descs = books.loc[books['simple_categories'] == label, 'description'].reset_index(drop=True)[:500]\n", "\n", " for desc in tqdm(descs, desc=f'Calculating accuracy for 500 {label} books'):\n", " predicted_label = generate_predictions(desc, fiction_categories)\n", " if predicted_label == label:\n", " correct += 1\n", "\n", " accuracy = correct / len(descs)\n", " demo_500_accuracy[label] = accuracy\n", " \n", "# Calculate macro average accuracy\n", "demo_500_accuracy['total'] = sum(demo_500_accuracy.values()) / len(demo_500_accuracy)\n", "demo_500_accuracy" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Predicting for books without simple categories: 100%|██████████| 1454/1454 [02:13<00:00, 10.93it/s]\n" ] } ], "source": [ "# Predict categories for books without simple categories\n", "\n", "isbns, preds = [], []\n", "non_cat_books = books.loc[books['simple_categories'].isna(), ['isbn13', 'description']].reset_index(drop=True)\n", "\n", "for i in tqdm(range(len(non_cat_books)), desc=\"Predicting for books without simple categories\"):\n", " sequence = non_cat_books.loc[i, 'description']\n", " \n", " isbns.append(non_cat_books.loc[i, 'isbn13'])\n", " preds.append(generate_predictions(sequence, fiction_categories))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
isbn13full_titleauthorscategoriesdescriptionfull_descpublished_yearnum_pagesaverage_ratingratings_countthumbnailsimple_categoriespredicted_categories
09780002005883GileadMarilynne RobinsonFictionA NOVEL THAT READERS and critics have been eag...9780002005883 A NOVEL THAT READERS and critics...2004.0247.03.85361.0http://books.google.com/books/content?id=KQZCP...FictionNaN
19780002261982Spider's Web: A NovelCharles Osborne;Agatha ChristieDetective and mystery storiesA new 'Christie for Christmas' -- a full-lengt...9780002261982 A new 'Christie for Christmas' -...2000.0241.03.835164.0http://books.google.com/books/content?id=gA5GP...NaNFiction
29780006178736Rage of angelsSidney SheldonFictionA memorable, mesmerizing heroine Jennifer -- b...9780006178736 A memorable, mesmerizing heroine...1993.0512.03.9329532.0http://books.google.com/books/content?id=FKo2T...FictionNaN
39780006280897The Four LovesClive Staples LewisChristian lifeLewis' work on the nature of love divides love...9780006280897 Lewis' work on the nature of lov...2002.0170.04.1533684.0http://books.google.com/books/content?id=XhQ5X...NaNNonfiction
49780006280934The Problem of PainClive Staples LewisChristian life\"In The Problem of Pain, C.S. Lewis, one of th...9780006280934 \"In The Problem of Pain, C.S. Le...2002.0176.04.0937569.0http://books.google.com/books/content?id=Kk-uV...NaNNonfiction
\n", "
" ], "text/plain": [ " isbn13 full_title authors \\\n", "0 9780002005883 Gilead Marilynne Robinson \n", "1 9780002261982 Spider's Web: A Novel Charles Osborne;Agatha Christie \n", "2 9780006178736 Rage of angels Sidney Sheldon \n", "3 9780006280897 The Four Loves Clive Staples Lewis \n", "4 9780006280934 The Problem of Pain Clive Staples Lewis \n", "\n", " categories \\\n", "0 Fiction \n", "1 Detective and mystery stories \n", "2 Fiction \n", "3 Christian life \n", "4 Christian life \n", "\n", " description \\\n", "0 A NOVEL THAT READERS and critics have been eag... \n", "1 A new 'Christie for Christmas' -- a full-lengt... \n", "2 A memorable, mesmerizing heroine Jennifer -- b... \n", "3 Lewis' work on the nature of love divides love... \n", "4 \"In The Problem of Pain, C.S. Lewis, one of th... \n", "\n", " full_desc published_year \\\n", "0 9780002005883 A NOVEL THAT READERS and critics... 2004.0 \n", "1 9780002261982 A new 'Christie for Christmas' -... 2000.0 \n", "2 9780006178736 A memorable, mesmerizing heroine... 1993.0 \n", "3 9780006280897 Lewis' work on the nature of lov... 2002.0 \n", "4 9780006280934 \"In The Problem of Pain, C.S. Le... 2002.0 \n", "\n", " num_pages average_rating ratings_count \\\n", "0 247.0 3.85 361.0 \n", "1 241.0 3.83 5164.0 \n", "2 512.0 3.93 29532.0 \n", "3 170.0 4.15 33684.0 \n", "4 176.0 4.09 37569.0 \n", "\n", " thumbnail simple_categories \\\n", "0 http://books.google.com/books/content?id=KQZCP... Fiction \n", "1 http://books.google.com/books/content?id=gA5GP... NaN \n", "2 http://books.google.com/books/content?id=FKo2T... Fiction \n", "3 http://books.google.com/books/content?id=XhQ5X... NaN \n", "4 http://books.google.com/books/content?id=Kk-uV... NaN \n", "\n", " predicted_categories \n", "0 NaN \n", "1 Fiction \n", "2 NaN \n", "3 Nonfiction \n", "4 Nonfiction " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create predicted books dataframe\n", "preds_df = pd.DataFrame({'isbn13': isbns, 'predicted_categories': preds})\n", "books_with_cat = pd.merge(books, preds_df, on='isbn13', how='left')\n", "books_with_cat.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
isbn13full_titleauthorscategoriesdescriptionfull_descpublished_yearnum_pagesaverage_ratingratings_countthumbnailfinal_categories
09780002005883GileadMarilynne RobinsonFictionA NOVEL THAT READERS and critics have been eag...9780002005883 A NOVEL THAT READERS and critics...2004.0247.03.85361.0http://books.google.com/books/content?id=KQZCP...Fiction
19780002261982Spider's Web: A NovelCharles Osborne;Agatha ChristieDetective and mystery storiesA new 'Christie for Christmas' -- a full-lengt...9780002261982 A new 'Christie for Christmas' -...2000.0241.03.835164.0http://books.google.com/books/content?id=gA5GP...Fiction
29780006178736Rage of angelsSidney SheldonFictionA memorable, mesmerizing heroine Jennifer -- b...9780006178736 A memorable, mesmerizing heroine...1993.0512.03.9329532.0http://books.google.com/books/content?id=FKo2T...Fiction
39780006280897The Four LovesClive Staples LewisChristian lifeLewis' work on the nature of love divides love...9780006280897 Lewis' work on the nature of lov...2002.0170.04.1533684.0http://books.google.com/books/content?id=XhQ5X...Nonfiction
49780006280934The Problem of PainClive Staples LewisChristian life\"In The Problem of Pain, C.S. Lewis, one of th...9780006280934 \"In The Problem of Pain, C.S. Le...2002.0176.04.0937569.0http://books.google.com/books/content?id=Kk-uV...Nonfiction
\n", "
" ], "text/plain": [ " isbn13 full_title authors \\\n", "0 9780002005883 Gilead Marilynne Robinson \n", "1 9780002261982 Spider's Web: A Novel Charles Osborne;Agatha Christie \n", "2 9780006178736 Rage of angels Sidney Sheldon \n", "3 9780006280897 The Four Loves Clive Staples Lewis \n", "4 9780006280934 The Problem of Pain Clive Staples Lewis \n", "\n", " categories \\\n", "0 Fiction \n", "1 Detective and mystery stories \n", "2 Fiction \n", "3 Christian life \n", "4 Christian life \n", "\n", " description \\\n", "0 A NOVEL THAT READERS and critics have been eag... \n", "1 A new 'Christie for Christmas' -- a full-lengt... \n", "2 A memorable, mesmerizing heroine Jennifer -- b... \n", "3 Lewis' work on the nature of love divides love... \n", "4 \"In The Problem of Pain, C.S. Lewis, one of th... \n", "\n", " full_desc published_year \\\n", "0 9780002005883 A NOVEL THAT READERS and critics... 2004.0 \n", "1 9780002261982 A new 'Christie for Christmas' -... 2000.0 \n", "2 9780006178736 A memorable, mesmerizing heroine... 1993.0 \n", "3 9780006280897 Lewis' work on the nature of lov... 2002.0 \n", "4 9780006280934 \"In The Problem of Pain, C.S. Le... 2002.0 \n", "\n", " num_pages average_rating ratings_count \\\n", "0 247.0 3.85 361.0 \n", "1 241.0 3.83 5164.0 \n", "2 512.0 3.93 29532.0 \n", "3 170.0 4.15 33684.0 \n", "4 176.0 4.09 37569.0 \n", "\n", " thumbnail final_categories \n", "0 http://books.google.com/books/content?id=KQZCP... Fiction \n", "1 http://books.google.com/books/content?id=gA5GP... Fiction \n", "2 http://books.google.com/books/content?id=FKo2T... Fiction \n", "3 http://books.google.com/books/content?id=XhQ5X... Nonfiction \n", "4 http://books.google.com/books/content?id=Kk-uV... Nonfiction " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "books_with_cat['final_categories'] = np.where(\n", " books_with_cat['predicted_categories'].isna(),\n", " books_with_cat['simple_categories'],\n", " books_with_cat['predicted_categories']\n", ")\n", "books_with_cat = books_with_cat.drop(columns=['simple_categories', 'predicted_categories'])\n", "\n", "books_with_cat.to_csv('data/books_with_categories.csv', index=False)\n", "books_with_cat.head()" ] } ], "metadata": { "kernelspec": { "display_name": "book_rcm", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 2 }