import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize as wt
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# NLTK resources used below (tokenizer, stopword list, WordNet)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
spam = pd.read_csv('spam.csv',  encoding='latin-1')
spam.head(10)
v1 v2 Unnamed: 2 Unnamed: 3 Unnamed: 4
0 ham Go until jurong point, crazy.. Available only ... NaN NaN NaN
1 ham Ok lar... Joking wif u oni... NaN NaN NaN
2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN NaN NaN
3 ham U dun say so early hor... U c already then say... NaN NaN NaN
4 ham Nah I don't think he goes to usf, he lives aro... NaN NaN NaN
5 spam FreeMsg Hey there darling it's been 3 week's n... NaN NaN NaN
6 ham Even my brother is not like to speak with me. ... NaN NaN NaN
7 ham As per your request 'Melle Melle (Oru Minnamin... NaN NaN NaN
8 spam WINNER!! As a valued network customer you have... NaN NaN NaN
9 spam Had your mobile 11 months or more? U R entitle... NaN NaN NaN
spam = spam.filter(['v1','v2'], axis=1)
spam.columns = ['label', 'text']
spam
label text
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...
... ... ...
5567 spam This is the 2nd time we have tried 2 contact u...
5568 ham Will Ì_ b going to esplanade fr home?
5569 ham Pity, * was in mood for that. So...any other s...
5570 ham The guy did some bitching but I acted like i'd...
5571 ham Rofl. Its true to its name

5572 rows × 2 columns

spam_data = spam[spam['label'] == 'spam']
real_data = spam[spam['label'] == 'ham']

Let's stem and then lemmatize the messages. According to the Stanford NLP Group,

"Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma."

Here stemming is applied to the full dataset, the spam messages, and the ham messages separately

stop_words = set(stopwords.words('english'))  # build the stopword set once, not per token
stemmer = PorterStemmer()

def preprocess(text, normalize):
    # Keep letters only, lowercase, tokenize, drop stopwords, normalize each token
    text = re.sub('[^A-Za-z]', ' ', text).lower()
    return " ".join(normalize(word) for word in wt(text) if word not in stop_words)

all_data_stem  = [preprocess(sms, stemmer.stem) for sms in spam['text']]
spam_data_stem = [preprocess(sms, stemmer.stem) for sms in spam_data['text']]
real_data_stem = [preprocess(sms, stemmer.stem) for sms in real_data['text']]
all_data_stem[0]
'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

Here the lemmatizer is applied to the full dataset, the spam messages, and the ham messages separately, reusing the same preprocess helper

lemmatizer = WordNetLemmatizer()

all_data_lemma  = [preprocess(sms, lemmatizer.lemmatize) for sms in spam['text']]
spam_data_lemma = [preprocess(sms, lemmatizer.lemmatize) for sms in spam_data['text']]
real_data_lemma = [preprocess(sms, lemmatizer.lemmatize) for sms in real_data['text']]
all_data_lemma[0]
'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

As the first message already shows, stemming and lemmatization work differently. For example, the word "available" is reduced to the stem "avail", while its lemma is "available" itself.
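
To see the contrast directly, here is a quick side-by-side of the two normalizers on a handful of sample words, using the same stemmer and lemmatizer objects as above:

# Compare Porter stems with WordNet lemmas on a few sample words,
# e.g. 'available' -> stem 'avail' vs lemma 'available'
for word in ['available', 'crazy', 'studies', 'wolves']:
    print(f"{word}: stem={stemmer.stem(word)}, lemma={lemmatizer.lemmatize(word)}")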

spam.drop_duplicates(inplace = True)
spam
label text
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...
... ... ...
5567 spam This is the 2nd time we have tried 2 contact u...
5568 ham Will Ì_ b going to esplanade fr home?
5569 ham Pity, * was in mood for that. So...any other s...
5570 ham The guy did some bitching but I acted like i'd...
5571 ham Rofl. Its true to its name

5169 rows × 2 columns

spam.isnull().sum()
label    0
text     0
dtype: int64

Map the labels to numerical values (ham → 0, spam → 1)

spam['num_label'] = spam['label'].map({'ham': 0, 'spam': 1})
spam.head()
label text num_label
0 ham Go until jurong point, crazy.. Available only ... 0
1 ham Ok lar... Joking wif u oni... 0
2 spam Free entry in 2 a wkly comp to win FA Cup fina... 1
3 ham U dun say so early hor... U c already then say... 0
4 ham Nah I don't think he goes to usf, he lives aro... 0
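
Before modelling, it is also worth checking the class balance; this dataset is heavily skewed towards ham (roughly 87% ham to 13% spam), which explains part of the precision/recall behaviour seen later:

# The dataset is imbalanced: far more ham than spam
spam['num_label'].value_counts(normalize=True)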

Let's create word clouds to see the most frequent words in each class

spam_words = ' '.join(list(spam[spam['num_label'] == 1]['text']))
spam_wc = WordCloud(width = 600,height = 512).generate(spam_words)
plt.figure(figsize = (12, 8), facecolor = 'k')
plt.imshow(spam_wc)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()
real_words = ' '.join(list(spam[spam['num_label'] == 0]['text']))
real_wc = WordCloud(width = 600,height = 512).generate(real_words)
plt.figure(figsize = (12, 8), facecolor = 'k')
plt.imshow(real_wc)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()
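The clouds above are built from the raw text. An alternative is to build them from the lemmatized corpora computed earlier, so the clouds reflect the same normalization used for modelling; a minimal sketch for the spam class:

# Word cloud from the preprocessed (lemmatized) spam messages instead of raw text
spam_wc_lemma = WordCloud(width=600, height=512).generate(' '.join(spam_data_lemma))
plt.figure(figsize=(12, 8), facecolor='k')
plt.imshow(spam_wc_lemma)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()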
from nltk import FreqDist
spam_token = nltk.tokenize.word_tokenize(spam_words)
spam_freq = FreqDist(spam_token)
spam_freq
FreqDist({'.': 1004, 'to': 608, '!': 542, ',': 371, 'a': 358, 'you': 189, 'call': 187, 'your': 187, 'or': 185, '&': 178, ...})
spam_freq.most_common(5)
[('.', 1004), ('to', 608), ('!', 542), (',', 371), ('a', 358)]

The most frequently repeated messages among the spam SMS

FreqDist(spam_data_lemma).most_common(5)
[('private account statement show un redeemed point call identifier code expires',
  9),
 ('u secret admirer looking make contact u find r reveal think ur special call',
  6),
 ('urgent trying contact u today draw show prize guaranteed call land line claim valid hr',
  5),
 ('please call customer service representative freephone pm guaranteed cash prize',
  4),
 ('free st week nokia tone ur mob every week txt nokia get txting tell ur mate www getzed co uk pobox w wq norm p tone',
  4)]

The most frequently repeated messages among the ham SMS

FreqDist(real_data_lemma).most_common(5)
[('sorry call later', 30),
 ('ok', 20),
 ('cant pick phone right pls send message', 12),
 ('', 8),
 ('okie', 7)]
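
Note that FreqDist over these lists counts whole processed messages, so most_common returns duplicated messages rather than words. For the most common individual words in the processed spam corpus, split the joined text first; a small sketch:

# Most common individual words (not whole messages) in the processed spam corpus
spam_word_freq = FreqDist(' '.join(spam_data_lemma).split())
spam_word_freq.most_common(10)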

Pass the tokenized words to nltk.Text to obtain a dispersion plot showing where selected words occur across the corpus

text1 = nltk.Text(spam_token)
text1.dispersion_plot(['free','private','account','contact'])

Create a bag-of-words feature matrix for the machine learning models

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# all_data_lemma was built before drop_duplicates(), so rebuild it from the
# deduplicated frame to keep the features aligned with the labels
all_data_lemma = [preprocess(sms, lemmatizer.lemmatize) for sms in spam['text']]

matrix = CountVectorizer(max_features=1000)
X = matrix.fit_transform(all_data_lemma).toarray()
y = spam['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

Let's try a Naive Bayes classifier

from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(X_train, y_train)
GaussianNB()
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rep = classification_report(y_test, y_pred)

The report shows that the model detects hams well but performs poorly on spams: the precision for spam is ~0.38, indicating a large number of false positives. Although the overall accuracy is 0.79, that figure is misleading. Recall for spam is high while precision is low, meaning the model is biased towards predicting spam: it catches most of the true spams but also wrongly flags many hams as spam.

print(rep)
              precision    recall  f1-score   support

         ham       0.98      0.77      0.86       968
        spam       0.38      0.92      0.53       147

    accuracy                           0.79      1115
   macro avg       0.68      0.84      0.70      1115
weighted avg       0.90      0.79      0.82      1115

The confusion matrix tells the same story: 224 hams were misclassified as spam, more than the 135 spams that were caught correctly, so the diagonal does not dominate. This performance is not good enough for Naive Bayes.

cm
array([[744, 224],
       [ 12, 135]], dtype=int64)
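
Part of the problem is the model choice itself: GaussianNB assumes continuous, roughly Gaussian features, which is a poor fit for sparse word counts. MultinomialNB is the standard Naive Bayes variant for bag-of-words data and would likely perform much better here; a sketch (results not shown):

from sklearn.naive_bayes import MultinomialNB

# MultinomialNB models word counts directly, unlike GaussianNB
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
print(classification_report(y_test, mnb.predict(X_test)))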

Let's try Support Vector Classification

from sklearn.svm import SVC

classifier = SVC()
classifier.fit(X_train, y_train)
SVC()
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rep = classification_report(y_test, y_pred)

The model performance is good: precision and recall for both ham and spam are high

print(rep)
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       968
        spam       0.98      0.86      0.92       147

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Let's try Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)
RandomForestClassifier()
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
rep = classification_report(y_test, y_pred)

The model performance is also good: precision and recall for both ham and spam are high

print(rep)
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       968
        spam       1.00      0.90      0.95       147

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115
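
These scores all come from a single random split, so a cross-validated comparison gives a more reliable ranking of the three classifiers; a minimal sketch:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy for each classifier on the full feature matrix
for clf in (GaussianNB(), SVC(), RandomForestClassifier()):
    scores = cross_val_score(clf, X, y, cv=5)
    print(type(clf).__name__, scores.mean().round(3))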