This project aims to predict the genre of a book given only its title. The code below was written and designed by Allison Kahn as part of a larger group project for UPenn's CIS 519 course, Applied Machine Learning; the models produced by all teammates were later combined into an ensemble method.
import csv
from google.colab import drive
drive.mount('/content/gdrive')
The data we are using for our project is UCSD's Book Graph dataset. The main dataset is stored as a JSON file with 2.3M entries and 29 fields, covering a variety of datatypes from integers and strings to dictionaries and lists. This JSON is too large to load into memory on Colab, so we process it line by line, appending the relevant data to a CSV 10,000 lines at a time.
import json
import gzip
import csv
fields_to_take = ['isbn', 'average_rating', 'description', 'link', 'authors',
                  'publisher', 'num_pages', 'isbn13', 'publication_year', 'image_url',
                  'book_id', 'title', 'title_without_series', 'language_code']  # these are the fields that we want to keep

dict_chuck = []
i = 0
first = True
for line in gzip.open("gdrive/MyDrive/CIS 519 Project/Data/Raw Data/goodreads_books.json.gz", 'r'):
    json_line = json.loads(line.decode("utf-8"))
    line_dict = {}
    keep = False
    for field in fields_to_take:  # for each field we want, extract the data associated with it and store it in a dict
        if field == "authors":
            if json_line[field] == []:
                out = ''
            else:
                out = json_line[field][0]['author_id']
        # in the image_url field, Goodreads uses a default image when it doesn't have access to a cover; we want to remove these
        elif field == "image_url" and json_line[field] == 'https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png':
            out = ''
        elif field == "language_code":  # if the language is set to something other than English, we aren't using it in our analysis
            if json_line[field] == "eng" or json_line[field] == "":
                keep = True
            out = json_line[field]
        else:
            out = json_line[field]
        line_dict[field] = out
    if keep:
        dict_chuck.append(line_dict)
    i += 1
    # append rows every 10000 lines
    if i % 10000 == 0:
        print(i)
        if first:
            with open('gdrive/MyDrive/CIS 519 Project/Data/goodreads_books_cleaned.csv', 'w') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=fields_to_take)
                writer.writeheader()
                writer.writerows(dict_chuck)
            first = False
            dict_chuck = []
        else:
            with open('gdrive/MyDrive/CIS 519 Project/Data/goodreads_books_cleaned.csv', 'a') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=line_dict.keys())
                writer.writerows(dict_chuck)
            dict_chuck = []
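One caveat with the chunked write above: any rows accumulated after the last full 10,000-line chunk are never flushed. A final append such as the following (an addition for completeness, assuming the same output path) would capture them:
# flush whatever remains in the buffer once the loop has finished
if dict_chuck:
    with open('gdrive/MyDrive/CIS 519 Project/Data/goodreads_books_cleaned.csv', 'a') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields_to_take)
        writer.writerows(dict_chuck)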
After we extract all of the data we want, we're left with a CSV of 1,768,097 rows and 14 columns:
import pandas as pd
df = pd.read_csv('gdrive/MyDrive/CIS 519 Project/Data/goodreads_books_cleaned.csv')
df.head()
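As a quick sanity check (not part of the original notebook), we can confirm the reported dimensions directly:
print(df.shape)  # expect (1768097, 14) as described above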
The next task we have is to include the genres in our data. The UCSD data pulls genre from user tags in the following format:
import pandas as pd
genre_created = pd.read_json('gdrive/MyDrive/CIS 519 Project/Data/Raw Data/goodreads_book_genres_initial.json.gz',lines=True)
genre_created.head()
We then clean this data into a usable format by sorting the genres by the number of tags and converting them into columns: the 'first' column holds the genre tagged most often, the 'second' column the genre tagged second most often, and so on. We also want to explore different genre conventions, so we add a 'genre_cleaned' column that uses the second most common tag in place of the first when the first tag is 'fiction', in an attempt to make the genres more specific (a sketch of this step appears after the cleaning code below). In later parts of the project, models are trained against both the true most common genre ('first') and this created column.
def cleanGenreList(originalList):
    out = sorted(dict(originalList['genres']), key=originalList['genres'].get, reverse=True)
    if len(out) == 0:
        out = ['', '', '']
    elif len(out) == 1:
        out.append('')
        out.append('')
    elif len(out) == 2:
        out.append('')
    else:
        out = out[:3]
    return out
genre_created_cleaned = genre_created.copy()
genre_created_cleaned[['first', 'second', 'third']] = genre_created.apply(lambda x: cleanGenreList(x), axis=1, result_type='expand')
genre_created_cleaned.head()
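The 'genre_cleaned' column described above isn't created in the cell shown here; a minimal sketch of how it might be derived from the 'first' and 'second' columns is:
# use the second most common tag when the most common tag is 'fiction';
# fall back to 'fiction' itself if no second tag exists (this fallback is an assumption)
genre_created_cleaned['genre_cleaned'] = genre_created_cleaned.apply(
    lambda row: row['second'] if row['first'] == 'fiction' and row['second'] != '' else row['first'],
    axis=1)
genre_created_cleaned.head()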
genre_created_cleaned[['first', 'book_id']].groupby(by=['first']).count()
genre_created_cleaned.to_csv('gdrive/MyDrive/CIS 519 Project/Data/genre_lookup_cleaned.csv')
# reload the cleaned genre lookup we just saved (in a fresh session, remount the drive and re-import pandas first)
genre_lookup = pd.read_csv('gdrive/MyDrive/CIS 519 Project/Data/genre_lookup_cleaned.csv')
Next, we need to join these files together:
df_w_genre = df.merge(genre_lookup, how="inner", on="book_id")
df_w_genre = df_w_genre.drop(columns=['Unnamed: 0', 'genres'])
df_w_genre.head()
df_w_genre.to_csv('gdrive/MyDrive/CIS 519 Project/Data/books_with_genre.csv.gz', compression='gzip')
Closer inspection of the data shows that some rows missing the language tag are in a language other than English. As the scope of our project only extends to English-language books, we need to identify and remove them.
#!pip install langdetect
from langdetect import detect
def detectLang(x):
    if x != x:  # NaN (missing description)
        return ''
    elif type(x) == str:
        try:
            return detect(x)
        except:
            # if the description cannot be read by the language detection package, skip it.
            # Most of these are only punctuation or links.
            return None
    else:
        print(type(x))

# only detect the language for rows that are missing a language_code but have a usable description
mask = (df_w_genre['language_code'] != df_w_genre['language_code']) & \
       (df_w_genre['description'] == df_w_genre['description']) & \
       (~df_w_genre['description'].isin(['<>', '<', '>', '.', ',']))
df_w_genre['detected_lang'] = df_w_genre.loc[mask, 'description'].apply(detectLang)
Now that all the rows in question have a language assigned, what do they look like?
from collections import Counter
Counter(df_w_genre['detected_lang'])
Most of the NaNs in this list are rows that already had a language code in the dataset (i.e., English) and were therefore skipped in the last step. The None values are the rows that raised exceptions in the language detection code, so we want to remove those.
df_w_genre = df_w_genre[~df_w_genre['detected_lang'].isin([None])]
df_w_genre = df_w_genre[df_w_genre['first'] == df_w_genre['first']]  # the self-comparison is False for NaN, so this keeps only rows with a genre
#!pip install langdetect
import csv
from google.colab import drive
import pandas as pd
import numpy as np
from collections import Counter
import joblib
from wordcloud import WordCloud
import matplotlib.pyplot as plt
#drive.mount('/content/gdrive')
df = pd.read_csv('gdrive/MyDrive/Grad School/Spring 2022/CIS 519/CIS 519 Project/Data/books_clean.csv.gz', compression='gzip')
Before we start processing the titles, we need to remove some of the punctuation that would otherwise split words. For example, in the book title "20,000 Leagues Under the Sea", we want the unigram representation to be "20000, leagues, under, the, sea", not "20, 000, leagues, under, the, sea".
We also don't want words to split on apostrophes; for example, "Charlotte's Web" should become "charlottes, web", not "charlotte, s, web".
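As a quick illustration of why this matters for the vectorizer used later, scikit-learn's default analyzer (assumed here) splits on punctuation and drops single-character tokens:
from sklearn.feature_extraction.text import TfidfVectorizer
analyzer = TfidfVectorizer().build_analyzer()
print(analyzer("20,000 Leagues Under the Sea"))  # ['20', '000', 'leagues', 'under', 'the', 'sea']
print(analyzer("20000 Leagues Under the Sea"))   # ['20000', 'leagues', 'under', 'the', 'sea']
print(analyzer("Charlotte's Web"))               # ['charlotte', 'web'] -- the lone "s" is dropped
print(analyzer("Charlottes Web"))                # ['charlottes', 'web']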
df = df[df['title'] == df['title']]  # keep only rows with a non-null title
df['title'] = df['title'].str.replace(',','')
df['title'] = df['title'].str.replace('\'','')
Next, we can visualize common words in each genre:
df_word_cloud = df[['title', 'first']].drop_duplicates()
for genre in df['first'].unique():
    print(genre, ": ")
    text = df_word_cloud[df_word_cloud['first'] == genre]['title'].values
    wordcloud = WordCloud().generate(str(' '.join(text)))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
As seen in the chart below, our data is fairly unbalanced. In order to improve our models, we can experiment with oversampling and undersampling. During the testing phase we found that oversampling worked the best, so the models below are trained with the balanced dataset.
class_size = dict(Counter(df['first']))
plt.bar(class_size.keys(), class_size.values())
plt.xticks(rotation=90)

min_val = round(min(class_size.values()) * 1.5)
undersampling_list = []
for i in df['first'].unique():
    if len(df[df['first'] == i]) < min_val:
        undersampling_list.extend(df[df['first'] == i]['book_id'])
    else:
        undersampling_list.extend(df[df['first'] == i].sample(min_val)['book_id'])
df_under = df[df['book_id'].isin(undersampling_list)]

max_val = max(class_size.values())
oversampling_list = []
for i in df['first'].unique():
    oversampling_list.extend(df[df['first'] == i].sample(max_val, replace=True)['book_id'])
df_over = pd.DataFrame({'book_id': oversampling_list})
df_over = df_over.merge(df, on='book_id', how='left')
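As a quick check (an addition, not in the original notebook), we can compare the class counts before and after oversampling; counts may drift slightly above max_val where duplicate book_ids are multiplied by the merge above:
print(Counter(df['first']))       # original, unbalanced class counts
print(Counter(df_over['first']))  # oversampled class counts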
After balancing our dataset, we need to transform our list of titles into a format that can be more easily used. In this application, we used TF-IDF. When creating this model, we experimented with using different preprocessing steps. In the end, we used the following parameters:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(df_over['title'], df_over['first'], test_size=0.2, random_state=42, stratify=df_over['first'])
tf_idf = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=5, sublinear_tf=True)
X_train_tf = tf_idf.fit_transform(X_train)
X_test_tf = tf_idf.transform(X_test)
print("n_samples: %d, n_features: %d" % X_train_tf.shape)
print("n_samples: %d, n_features: %d" % X_test_tf.shape)
With the parameters set, there are ~380k features in our model. We can see what a slice of them looks like here:
tf_idf.get_feature_names_out()[20000:20010]
#joblib.dump(tf_idf, 'gdrive/MyDrive/Grad School/Spring 2022/CIS 519/CIS 519 Project/Models/TFDIF_Model.pkl')
The first model we created is a Naive Bayes classifier with alpha=0.1, identified through grid search. We can then print out the accuracies achieved with this model.
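The exact grid isn't recorded in this notebook, so the following is only a sketch of how such a search might look, with an illustrative set of alpha values:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
# illustrative grid only; the original search's parameter values are not recorded
param_grid = {'alpha': [0.01, 0.1, 0.5, 1.0]}
search = GridSearchCV(MultinomialNB(), param_grid, scoring='f1_macro', cv=3)
search.fit(X_train_tf, y_train)
print(search.best_params_)  # the run described here settled on alpha=0.1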
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB(alpha=0.1)
nb_clf.fit(X_train_tf, y_train)
nb_clf.score(X_test_tf, y_test)
pred_bayes = nb_clf.predict(X_test_tf)
print("Train F1 Score: ", metrics.f1_score(y_train, nb_clf.predict(X_train_tf), average='macro'))
print("Test F1 Score: ", metrics.f1_score(y_test, pred_bayes, average='macro'))
print("\nTrain Top 3 Accuracy: ", metrics.top_k_accuracy_score(y_train, nb_clf.predict_log_proba(X_train_tf), k=3))
print("Test Top 3 Accuracy: ", metrics.top_k_accuracy_score(y_test, nb_clf.predict_log_proba(X_test_tf), k=3))
print(metrics.classification_report(y_test, pred_bayes))
#joblib.dump(nb_clf, 'gdrive/MyDrive/Grad School/Spring 2022/CIS 519/CIS 519 Project/Models/Title_NB_Model.pkl')
print("Model Saved")
The second model is a Logistic Regression model, which achieved a higher accuracy than the Naive Bayes model. The breakout by genre also reveals very similar strengths and weaknesses.
from sklearn import linear_model
lr_clf = linear_model.LogisticRegression(solver='sag',max_iter=200,random_state=450)
lr_clf.fit(X_train_tf, y_train)
pred_lr = lr_clf.predict(X_test_tf)
print("Train F1 Score: ", metrics.f1_score(y_train, lr_clf.predict(X_train_tf), average='macro'))
print("Test F1 Score: ", metrics.f1_score(y_test, pred_lr, average='macro'))
print("\nTrain Top 3 Accuracy: ", metrics.top_k_accuracy_score(y_train, lr_clf.predict_log_proba(X_train_tf), k=3))
print("Test Top 3 Accuracy: ", metrics.top_k_accuracy_score(y_test, lr_clf.predict_log_proba(X_test_tf), k=3))
print(metrics.classification_report(y_test, pred_lr))
#joblib.dump(lr_clf, 'gdrive/MyDrive/Grad School/Spring 2022/CIS 519/CIS 519 Project/Models/Title_LR_Model.pkl')
print("Model Saved")
Now that we have two models trained to classify our titles, we can do a little experimentation to see what the outputs look like. First, we can predict against some of my favorite books that exist in the dataset to see how they perform.
for book in ["speaker for the dead", "the things they carried", "the anybodies", "the picture of dorian gray"]:
book_vectorized = tf_idf_model.transform([book])
probs = NB_model.predict_log_proba(book_vectorized)[0]
probs_ordered = np.sort(probs)
print("\nName: ", book)
print("Best Guess: ", NB_model.predict(book_vectorized)[0])
for i in range(len(NB_model.classes_)):
print(NB_model.classes_[i], ": ", probs[i])
The first book I chose is "Speaker for the Dead", a science fiction book by Orson Scott Card. The model correctly identified the book as fiction, with the second highest guess being "fantasy, paranormal". Considering how close sci-fi and fantasy are, I would call that a good guess.
The second book is "The Things They Carried" by Tim O'Brien. I would categorize the book as historical fiction, and the model identified it as fiction. The model did not perform as well on this title; the correct genre is listed third from the bottom in terms of likelihood. Given that "historical fiction" and "fiction" aren't necessarily mutually exclusive, this is an understandable and expected mistake.
The third book is "The Anybodies" by N.E. Bode, a children's fantasy book (that happened to be my favorite book growing up). This was correctly identified as a children's book, although it looks like that might have just been lucky, as it was an 8-way tie for first, likely because the word "anybodies" does not appear often in the dataset.
The last book is "The Picture of Dorian Gray" by Oscar Wilde, which was overwhelmingly predicted as fiction, the correct genre.
Now we can experiment with some books off Allison's shelf that were not present in the dataset at all:
for book in ['atlas of the national parks', "the complete idiots guide to socially responsible investing", "Havana Bay"]:
    if len(df[df['title'].str.lower().str.contains(book.lower())]['title'].drop_duplicates()) == 0:
        book_vectorized = tf_idf.transform([book])
        probs = nb_clf.predict_log_proba(book_vectorized)[0]
        print("\nName: ", book)
        print("Best Guess: ", nb_clf.predict(book_vectorized)[0])
        for i in range(len(nb_clf.classes_)):
            print(nb_clf.classes_[i], ": ", probs[i])
    else:
        print("------Book in Dataset------")
All of these books were correctly identified! The first two are non-fiction books of various types and the last is a crime novel by Martin Cruz Smith.
While exploring the predictions the model made for some books, I began to wonder what assumptions the model had made about gendered names. I took the 50 most common baby names of the past century (https://www.ssa.gov/oact/babynames/decades/century.html) and predicted against them as if they were book titles:
womens_names = ["Mary","Patricia","Jennifer","Linda","Elizabeth","Barbara","Susan",
"Jessica","Sarah","Karen","Nancy","Lisa","Betty","Margaret","Sandra",
"Ashley","Kimberly","Emily","Donna","Michelle","Dorothy","Carol","Amanda",
"Melissa","Deborah","Stephanie","Rebecca","Sharon","Laura","Cynthia",
"Kathleen","Amy","Shirley","Angela","Helen","Anna","Brenda","Pamela",
"Nicole","Emma","Samantha","Katherine","Christine","Debra","Rachel",
"Catherine","Carolyn","Janet","Ruth","Maria"]
mens_names = ["James","Robert","John","Michael","William","David","Richard","Joseph",
"Thomas","Charles","Christopher","Daniel","Matthew","Anthony","Mark",
"Donald","Steven","Paul","Andrew","Joshua","Kenneth","Kevin","Brian",
"George","Edward","Ronald","Timothy","Jason","Jeffrey","Ryan","Jacob",
"Gary","Nicholas","Eric","Jonathan","Stephen","Larry","Justin","Scott",
"Brandon","Benjamin","Samuel","Gregory","Frank","Alexander","Raymond",
"Patrick","Jack","Dennis","Jerry"]
women_names_vectors = tf_idf.transform(womens_names)
women_predicted = nb_clf.predict(women_names_vectors)
Counter(women_predicted)
men_names_vectors = tf_idf.transform(mens_names)
men_predicted = nb_clf.predict(men_names_vectors)
Counter(men_predicted)
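To make the two distributions easier to compare side by side (an addition, not in the original notebook), the Counters can be placed in a single DataFrame:
# tabulate predicted genre counts for the two name lists
comparison = pd.DataFrame({'women': Counter(women_predicted),
                           'men': Counter(men_predicted)}).fillna(0).astype(int)
print(comparison)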
The traditionally male names are overwhelmingly more likely to be associated with "history, historical fiction, biography", while the traditionally female names are more likely to be associated with "mystery, thriller, crime". The fact that male names tend to be associated with historical figures is not surprising, while we found it interesting that female names are associated with mystery novels.
# load the saved TF-IDF vectorizer and both classifiers to generate predictions for every title in the dataset
tf_idf_model = joblib.load('gdrive/MyDrive/Grad School/Spring 2022/CIS 519/CIS 519 Project/Models/TFDIF_Model.pkl')
NB_model = joblib.load('gdrive/MyDrive/Grad School/Spring 2022/CIS 519/CIS 519 Project/Models/Title_NB_Model.pkl')
logreg_model = joblib.load('gdrive/MyDrive/Grad School/Spring 2022/CIS 519/CIS 519 Project/Models/Title_LR_Model.pkl')
df_to_predict = df[['title','book_id','first']].drop_duplicates()
to_predict = tf_idf_model.transform(df_to_predict['title'])
nb_pred = NB_model.predict(to_predict)
logreg_pred = logreg_model.predict(to_predict)
df_to_predict['NB Predictions'] = nb_pred
df_to_predict['LogReg Predictions'] = logreg_pred
df_to_predict.to_csv('gdrive/MyDrive/Grad School/Spring 2022/CIS 519/CIS 519 Project/Final_Data/book_title_predictions.csv')
df[df['description'] == df['description']].shape  # count of rows with a non-null description