- The dataset consists of 3,000 Amazon customer reviews, along with star ratings, review dates, device variations, and feedback labels for various Amazon Alexa products such as the Echo and Echo Dot.
- The objective is to discover insights into consumer reviews and perform sentiment analysis on the data.
Dr. Ryan @STEMplicity
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
url = "https://datascienceschools.github.io/Machine_Learning/Classification_Models_CaseStudies/amazon_alexa.tsv"
df = pd.read_csv(url, sep ='\t')
df.head()
# Split reviews by feedback label (1 = positive, 0 = negative)
positive = df[df['feedback'] == 1]
negative = df[df['feedback'] == 0]
print("Total Feedback =", len(df))
print("\nPositive Feedback =", len(positive))
print("Percentage of Positive Feedback = {:.2f} %".format(len(positive) / len(df) * 100))
print("\nNegative Feedback =", len(negative))
print("Percentage of Negative Feedback = {:.2f} %".format(len(negative) / len(df) * 100))
f, ax = plt.subplots(2, 2, figsize=(40, 20))
sns.countplot(x='feedback', data=df, palette='Set1', ax=ax[0, 0])
sns.countplot(x='rating', data=df, palette='Set1', ax=ax[0, 1])
ax[1, 0].hist(df['rating'], color='purple', bins=4)
sns.barplot(x='variation', y='rating', data=df, palette='deep', ax=ax[1, 1])
ax[1, 1].tick_params(axis='x', rotation=45)  # rotate variation labels so they stay readable
plt.show()
# Drop columns that won't be used as features
df.drop(['date', 'rating'], axis=1, inplace=True)
df.head()
# One-hot encode the 'variation' column; drop_first avoids a redundant dummy column
variation_dummies = pd.get_dummies(df['variation'], drop_first=True)
variation_dummies.head()
- The 'variation' column is no longer needed, so let's drop it
df.drop('variation', axis=1, inplace=True)
df = pd.concat([variation_dummies, df], axis=1)
df.head()
from sklearn.feature_extraction.text import CountVectorizer
# Convert the review text into a bag-of-words count matrix
vectorizer = CountVectorizer()
verified_reviews_vectorized = vectorizer.fit_transform(df['verified_reviews'])
verified_reviews = pd.DataFrame(verified_reviews_vectorized.toarray())
verified_reviews.head()
- The 'verified_reviews' column is no longer needed, so let's drop it
df = df.drop('verified_reviews', axis=1)
df = pd.concat([verified_reviews, df], axis=1)
df.head()
# Features: every column except the last; target: the 'feedback' column (last)
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=9)
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training set only, then apply the same transform to the test set
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy is: {:.2f} %".format(accuracy*100))
sns.heatmap(cm, annot=True, fmt='d')
plt.show()
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
- Use the Random Forest classification algorithm to try for higher accuracy (see the sketch below)
- Random forests are:
    - a strong modeling technique
    - much more robust than a single decision tree
- They aggregate many decision trees to limit overfitting as well as error due to bias, and therefore yield useful results
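- A minimal sketch of that Random Forest step, reusing the scaled train/test split above; n_estimators=100 and random_state=9 are assumed choices, not taken from the original notebook
from sklearn.ensemble import RandomForestClassifier
# Assumed hyperparameters: 100 trees, fixed seed for reproducibility
rf_model = RandomForestClassifier(n_estimators=100, random_state=9)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
# accuracy_score and cross_val_score were imported earlier in this notebook
print("Random Forest Accuracy: {:.2f} %".format(accuracy_score(y_test, y_pred_rf) * 100))
rf_accuracies = cross_val_score(estimator=rf_model, X=X_train, y=y_train, cv=10)
print("Cross-Validation Accuracy: {:.2f} %".format(rf_accuracies.mean() * 100))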