Spam email classification using Naive Bayes, SVC and Random Forest
Here we will walk through stemming and lemmatization for NLP preprocessing, explore the corpus with word clouds and frequency plots, and then train and compare three classifiers.
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import re
from nltk.tokenize import word_tokenize as wt
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# One-time downloads of the NLTK data used below (tokenizer, stopwords, WordNet)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
spam = pd.read_csv('spam.csv', encoding='latin-1')
spam.head(10)
spam = spam.filter(['v1','v2'], axis=1)
spam.columns = ['label', 'text']
spam
spam_data = spam[spam['label'] == 'spam']
real_data = spam[spam['label'] == 'ham']
Let's stem and, separately, lemmatize the messages. According to the Stanford NLP Group,
"Stemming usually refers to a crude heuristic process that chops off the ends of
words in the hope of achieving this goal correctly most of the time, and often includes
the removal of derivational affixes. Lemmatization usually refers to doing things
properly with the use of a vocabulary and morphological analysis of words, normally
aiming to remove inflectional endings only and to return the base or dictionary
form of a word, which is known as the lemma."
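To make the distinction concrete, here is a tiny illustrative comparison (the sample words are our own, not drawn from the dataset):

# The stemmer chops suffixes heuristically; the lemmatizer maps words to dictionary forms
ps, wnl = PorterStemmer(), WordNetLemmatizer()
for w in ['studies', 'caring', 'feet']:
    print(w, '->', ps.stem(w), '|', wnl.lemmatize(w))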
# Build the stopword set once instead of rebuilding it for every token
stop_words = set(stopwords.words('english'))

def preprocess(messages, transform):
    """Strip non-letters, lowercase, tokenize, drop stopwords, and apply transform to each token."""
    cleaned = []
    for sms in messages:
        sms = re.sub('[^A-Za-z]', ' ', sms).lower()
        tokens = [transform(word) for word in wt(sms) if word not in stop_words]
        cleaned.append(' '.join(tokens))
    return cleaned

stemmer = PorterStemmer()
all_data_stem = preprocess(spam['text'], stemmer.stem)
spam_data_stem = preprocess(spam_data['text'], stemmer.stem)
real_data_stem = preprocess(real_data['text'], stemmer.stem)
all_data_stem[0]
lemmatizer = WordNetLemmatizer()
all_data_lemma = preprocess(spam['text'], lemmatizer.lemmatize)
spam_data_lemma = preprocess(spam_data['text'], lemmatizer.lemmatize)
real_data_lemma = preprocess(real_data['text'], lemmatizer.lemmatize)
all_data_lemma[0]
spam.drop_duplicates(inplace = True)
# all_data_lemma was built before deduplication, so rebuild it here to keep the
# feature rows aligned with the remaining labels
all_data_lemma = preprocess(spam['text'], lemmatizer.lemmatize)
spam
spam.isnull().sum()
spam['num_label'] = spam['label'].map({'ham': 0, 'spam': 1})
spam.head()
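Since plain accuracy can look deceptively good on an imbalanced corpus (a point we return to below), it is worth checking the class balance first:

# Ham heavily outnumbers spam in this dataset
spam['label'].value_counts()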
spam_words = ' '.join(list(spam[spam['num_label'] == 1]['text']))
spam_wc = WordCloud(width = 600, height = 512).generate(spam_words)
plt.figure(figsize = (12, 8), facecolor = 'k')
plt.imshow(spam_wc)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()
real_words = ' '.join(list(spam[spam['num_label'] == 0]['text']))
real_wc = WordCloud(width = 600, height = 512).generate(real_words)
plt.figure(figsize = (12, 8), facecolor = 'k')
plt.imshow(real_wc)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()
from nltk import FreqDist
# Frequencies over the raw (unprocessed) spam text, so stopwords and punctuation dominate
spam_token = wt(spam_words)
spam_freq = FreqDist(spam_token)
spam_freq
spam_freq.most_common(5)
# spam_data_lemma and real_data_lemma are lists of messages, so join and
# re-split into words before counting token frequencies
FreqDist(' '.join(spam_data_lemma).split()).most_common(5)
FreqDist(' '.join(real_data_lemma).split()).most_common(5)
text1 = nltk.Text(spam_token)
text1.dispersion_plot(['free','private','account','contact'])
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
matrix = CountVectorizer(max_features=1000)
# .toarray() densifies the sparse counts because GaussianNB below requires dense input
X = matrix.fit_transform(all_data_lemma).toarray()
y = spam['label']
# A fixed random_state makes the split, and hence the reported metrics, reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
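Before training, a quick sanity check on the feature matrix (get_feature_names_out assumes scikit-learn >= 1.0; older versions use get_feature_names):

# Rows are messages, columns are the 1000 most frequent kept tokens
print(X.shape)
print(matrix.get_feature_names_out()[:10])  # a peek at the learned vocabulary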
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rep = classification_report(y_test, y_pred)
The report shows that the model performs well at detecting hams but poorly on spams.
The precision for spam is ~0.38, which indicates many false positives (hams flagged as spam).
Although the overall accuracy is 0.79, it can be misleading: the recall for spam is high
while the precision is low, meaning the model is biased toward predicting spam. It
correctly identifies nearly all spams but also mislabels some hams as spam.
print(rep)
cm
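GaussianNB models each feature as a continuous Gaussian, which is a poor fit for sparse word counts; MultinomialNB is the Naive Bayes variant usually paired with count features. A minimal sketch for comparison (not part of the original pipeline):

from sklearn.naive_bayes import MultinomialNB

# Same split, but a count-aware Naive Bayes variant
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
print(classification_report(y_test, mnb.predict(X_test)))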
from sklearn.svm import SVC
classifier = SVC()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rep = classification_report(y_test, y_pred)
print(rep)
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rep = classification_report(y_test, y_pred)
print(rep)
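A single train/test split can be noisy, so as a final check, cross-validated accuracy gives a steadier comparison across the three models. A minimal sketch (5-fold, default scoring):

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy for each classifier on the full feature matrix
for model in [GaussianNB(), SVC(), RandomForestClassifier()]:
    scores = cross_val_score(model, X, y, cv=5)
    print(type(model).__name__, round(scores.mean(), 3))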