Spam email classification in TensorFlow
Here we will walk through spam email classification using TensorFlow. TensorFlow provides a wide variety of tools for processing text.
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Text-vectorization hyperparameters shared by the tokenizer, the padding
# step and the model below.
vocab_size = 1000      # keep only the 1000 most frequent words
embedding_dim = 64     # size of each learned word vector
max_length = 120       # every sequence is padded/truncated to this length
trunc_type = 'post'    # cut overly long sequences at the end
padding_type = 'post'  # pad short sequences at the end
oov_tok = "<OOV>"      # placeholder token for out-of-vocabulary words
# Load the raw SMS spam dataset; latin-1 avoids decode errors in the file.
spam = pd.read_csv('spam.csv', encoding='latin-1')
# Keep only the label (v1) and message (v2) columns, with readable names.
spam = spam.filter(items=['v1', 'v2'], axis='columns')
spam.columns = ['label', 'text']
spam
# Pull the message texts and their labels out of the dataframe as plain
# Python lists, preserving row order.
sentences = spam['text'].tolist()
labels = spam['label'].tolist()
# 80/20 train/test split, taken in the original row order.
training_size = int(spam.shape[0] * 0.8)

training_sentences = sentences[:training_size]
testing_sentences = sentences[training_size:]
training_labels_str = labels[:training_size]
testing_labels_str = labels[training_size:]

# Encode the string labels numerically: 'ham' (real email) -> 1,
# anything else (spam) -> 0.
training_labels = [1 if label == 'ham' else 0 for label in training_labels_str]
testing_labels = [1 if label == 'ham' else 0 for label in testing_labels_str]
# Convert the label lists to numpy arrays for model.fit.
# NOTE(review): the original code also did
#   training_padded = np.array(training_padded)
#   testing_padded = np.array(testing_padded)
# here, but those names are not defined until pad_sequences runs below,
# so the script crashed with a NameError when executed top-to-bottom.
# pad_sequences already returns numpy arrays, so those conversions are
# unnecessary and have been removed.
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)
# Build the vocabulary on the training split only, then turn both splits
# into fixed-length integer sequences.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

def _encode(texts):
    """Tokenize *texts* and pad/truncate every sequence to max_length."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_length,
                         padding=padding_type, truncating=trunc_type)

training_padded = _encode(training_sentences)
testing_padded = _encode(testing_sentences)
# Binary classifier: embed -> bidirectional LSTM -> dense -> dropout -> sigmoid.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# A small learning rate keeps LSTM training stable.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4, beta_1=0.9,
                                     beta_2=0.999, amsgrad=False)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()
# Train for a fixed number of epochs, validating on the held-out split.
num_epochs = 20
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=1,
)
import matplotlib.pyplot as plt
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart over epochs."""
    train_key = string
    val_key = 'val_' + string
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([train_key, val_key])
    plt.show()
# Visualize the learning curves for both tracked metrics.
for metric in ['accuracy', 'loss']:
    plot_graphs(history, metric)
# Alias the arrays with conventional sklearn-style names.
X_train = training_padded
X_test = testing_padded
y_train = training_labels
y_test = testing_labels
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 class predictions.
y_pred = model.predict(X_test)
y_prediction = [1 if score > 0.5 else 0 for score in y_pred]

rep = classification_report(y_test, y_prediction)
print(rep)
# Classify a new, unseen message with the trained model.
sample_text = ["Winner!!! Darling please click the link to claim your free prize"]
sample_text_tokenized = tokenizer.texts_to_sequences(sample_text)
sample_text_tokenized_padded = pad_sequences(sample_text_tokenized, maxlen=max_length, padding=padding_type, truncating=trunc_type)
# Labels were encoded above as 1 = ham (real email) and 0 = spam.
# model.predict returns a (1, 1) array; index out the scalar explicitly —
# calling float() directly on a multi-dimensional array is deprecated in
# NumPy (>= 1.25) and will eventually raise.
pred = float(model.predict(sample_text_tokenized_padded)[0][0])
if pred > 0.5:
    print("This is a real email")
else:
    print("This is a spam")