- The SMS Spam Collection is a set of tagged SMS messages that have been collected for SMS spam research. It contains one set of 5,574 SMS messages in English, each tagged as either ham (legitimate) or spam.
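- The files contain one message per line. Each line is composed of two columns: v1 contains the label (ham or spam) and v2 contains the raw text. Note that the CSV loaded below exposes these as a `text` column (the raw message) and a `spam` column (1 = spam, 0 = ham).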
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every figure drawn below.
sns.set()

# Load the labelled corpus. The rest of the script relies on two columns:
# 'text' (the raw message) and 'spam' (1 = spam, 0 = ham).
url = "https://datascienceschools.github.io/Machine_Learning/Classification_Models_CaseStudies/emails.csv"
df = pd.read_csv(url)
df.head()

# Split the rows by label so the class balance can be reported.
spam = df[df['spam'] == 1]
ham = df[df['spam'] == 0]

print("Total Emails =", len(df))
print("\nSpams =", len(spam))
print("Percentage of Spams = {:.2f} %".format(1.*len(spam)/len(df)*100.0))
print("\nHam =", len(ham))
print("Percentage of hams = {:.2f} %".format(1.*len(ham)/len(df)*100.0))

# Name the column through the x= keyword. Passing the Series positionally is
# deprecated in seaborn 0.11 and, from 0.12 on, binds it to `data` instead of
# `x`, so the plot would no longer be a per-class count.
sns.countplot(x='spam', data=df, palette='Set1')
plt.show()
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoding: learn the vocabulary from the 'text' column and count
# how often each token occurs in every message.
vectorizer = CountVectorizer()
text_verctorized = vectorizer.fit_transform(df['text'])

# toarray() materializes the sparse count matrix as a dense array; the
# resulting frame has one integer-labelled column per vocabulary entry.
text = pd.DataFrame(text_verctorized.toarray())
text.head()

# Swap the raw text for its token counts. The remaining original column(s)
# are appended on the right, so the label stays in the last column.
df = pd.concat([text, df.drop(columns='text')], axis=1)
df.head()
- print(vectorizer.get_feature_names_out())
from sklearn.model_selection import train_test_split

# Everything except the last column is a feature; the last column is the
# target (the 'spam' label appended after the token counts).
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Hold out 20 % of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=3,
)
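- No need to scale the data: Gaussian Naive Bayes estimates a separate mean and variance for each feature, so rescaling the features is not required.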
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian Naive Bayes classifier on the training split and predict
# labels for the held-out test split.
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out evaluation: confusion matrix plus overall accuracy.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy is: {:.2f} %".format(100 * accuracy))

# Draw the confusion matrix with integer cell annotations.
sns.heatmap(cm, annot=True, fmt='d')
plt.show()

# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

# 10-fold cross-validation on the training split; report the mean and spread
# of the per-fold scores.
accuracies = cross_val_score(model, X_train, y_train, cv=10)
print("Accuracy: {:.2f} %".format(100 * accuracies.mean()))
print("Standard Deviation: {:.2f} %".format(100 * accuracies.std()))