- Simple Linear Regression:
-> aims to find the best-fitting line through the data points
- Finding the line with the minimum sum of squares
-> Sum of Squares: Σ(y - ŷ)²
-> y: real values, ŷ: predicted values
- y = b0 + b1 x
-> y is the dependent variable (variable on the Y axis)
-> x is the independent variable (variable on the X axis)
-> b1 is the slope of the line (coefficient)
-> shows how much y will change given a one-unit shift in x
-> (in multiple regression, while holding the other variables constant)
-> for 1 more year of experience, the person receives b1 $ on top of their salary
- b0 is the y-intercept (constant)
-> the point where the best-fitting line crosses the y-axis
-> when a person has no experience (x = 0), salary = b0
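As a minimal sketch of the least-squares criterion above (using made-up toy numbers, not the salary data), the sum Σ(y - ŷ)² can be computed directly for any candidate line:

```python
import numpy as np

# Toy data (hypothetical values, for illustration only)
x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([3.1, 4.9, 7.2, 8.8])

# A candidate line y = b0 + b1*x
b0, b1 = 1.0, 2.0
y_hat = b0 + b1 * x

# Sum of squared residuals: the quantity simple linear regression minimises
sum_of_squares = np.sum((y - y_hat) ** 2)
print(sum_of_squares)
```

Regression searches over all (b0, b1) pairs for the one that makes this quantity as small as possible.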
- Importing the Relevant Libraries
- Loading the Data
- Declaring the Dependent and the Independent variables
- Splitting the dataset into the Training set and Test set
- Linear Regression Model
- Creating a Linear Regression
- Fitting The Model
- Predicting the Test Set Results
- Creating a Summary Table (Test Set Results)
- Making predictions
- Making a Single Observation Prediction
- Making Multiple Observations Prediction
- R-Squared (R²) , Intercept , Coefficient
- Calculating the R-squared (R²)
- Finding the intercept
- Finding the coefficients
- Final Regression Equation (y = b0 + b1 x)
- Data visualization
- Visualising the Training Set Results
- Visualising the Test Set Results
- Visualising the Train & Test Set Results on the same plot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
url = "https://datascienceschools.github.io/Machine_Learning/Sklearn/Case_Study/Salary/Salary_Data.csv"
df = pd.read_csv(url)
df.head()
| | YearsExperience | Salary |
|---|---|---|
| 0 | 1.1 | 39343.0 |
| 1 | 1.3 | 46205.0 |
| 2 | 1.5 | 37731.0 |
| 3 | 2.0 | 43525.0 |
| 4 | 2.2 | 39891.0 |
- X: independent variable -> input or feature
- y: dependent variable -> output or target
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1/3, random_state = 0)
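A hedged sketch of what the split does, on toy arrays rather than the salary data: `test_size = 1/3` holds out a third of the rows for testing, and `random_state = 0` makes the shuffle reproducible.

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Toy data: 6 samples, 2 features (illustration only)
X = np.arange(12).reshape(6, 2)
y = np.arange(6)

# One third of the samples go to the test set; the shuffle is fixed by random_state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=0)

print(X_train.shape, X_test.shape)  # (4, 2) (2, 2)
```

Re-running with the same `random_state` always yields the same partition, which keeps results comparable across runs.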
- LinearRegression Class from linear_model Module of sklearn Library
- model -> Object of LinearRegression Class
from sklearn.linear_model import LinearRegression
model = LinearRegression()
- fit method -> training the model
model.fit(X_train, y_train)
LinearRegression()
- y_pred -> the predicted salaries
y_pred = model.predict(X_test)
- Comparing predicted_salary & real_salary
data = pd.DataFrame(X_test).rename(columns={0: "experience_years"})
data['predicted_salary'] = y_pred
data['real_salary'] = y_test
data['difference'] = y_pred - y_test
data
| | experience_years | predicted_salary | real_salary | difference |
|---|---|---|---|---|
| 0 | 1.5 | 40835.105909 | 37731.0 | 3104.105909 |
| 1 | 10.3 | 123079.399408 | 122391.0 | 688.399408 |
| 2 | 4.1 | 65134.556261 | 57081.0 | 8053.556261 |
| 3 | 3.9 | 63265.367772 | 63218.0 | 47.367772 |
| 4 | 9.5 | 115602.645454 | 116969.0 | -1366.354546 |
| 5 | 8.7 | 108125.891499 | 109431.0 | -1305.108501 |
| 6 | 9.6 | 116537.239698 | 112635.0 | 3902.239698 |
| 7 | 4.0 | 64199.962017 | 55794.0 | 8405.962017 |
| 8 | 5.3 | 76349.687193 | 83088.0 | -6738.312807 |
| 9 | 7.9 | 100649.137545 | 101302.0 | -652.862455 |
- Predicting the salary of an employee with 12 years of experience
model.predict([[12]])
array([138967.5015615])
- Predicting salaries of employees with 0, 1, 5 & 10 years of experience
new_employees = pd.DataFrame({'years_of_experience': [0,1,5,10]})
new_employees['predicted_salary'] = model.predict(new_employees)
new_employees
| | years_of_experience | predicted_salary |
|---|---|---|
| 0 | 0 | 26816.192244 |
| 1 | 1 | 36162.134687 |
| 2 | 5 | 73545.904460 |
| 3 | 10 | 120275.616675 |
* What is R-squared?
- a statistical measure of how close the data are to the fitted regression line
- also known as the coefficient of determination
- R-squared lies between 0 and 1 (often quoted as 0 to 100%)
- the higher the R-squared, the better the model fits the data
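A minimal sketch (on toy data, not the salary file) of how R² relates to the residuals: R² = 1 - SS_res / SS_tot, which is the value `model.score` reports for a regressor.

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Toy data (hypothetical, for illustration only)
X = np.array([[1.0], [2.0], [3.0], [4.0], [5.0]])
y = np.array([2.0, 4.1, 5.9, 8.2, 9.8])

model = LinearRegression().fit(X, y)

# R² by hand: 1 - (residual sum of squares / total sum of squares)
y_hat = model.predict(X)
ss_res = np.sum((y - y_hat) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
r2_manual = 1 - ss_res / ss_tot

# Matches sklearn's model.score
print(r2_manual, model.score(X, y))
```

SS_tot measures the spread of y around its mean, so R² can be read as the fraction of that spread the line explains.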
Rsquared = model.score(X_train, y_train)
print('R-Squared is:', Rsquared)
R-Squared is: 0.9381900012894278
-> y = b0 + b1 x
-> b0 is the point where the best-fitting line crosses the y-axis
-> when a person has no experience (x = 0), salary = b0 = 26816.192244
Intercept = model.intercept_
print('Intercept is:', Intercept)
Intercept is: 26816.19224403119
-> y = b0 + b1 x
-> for 1 more year of experience, the salary increases by b1 $
-> b1 = 9345.94
-> person with no experience -> salary: 26816.192244 $
-> person with 1 year of experience -> salary: 36162.134687 $
-> 36162.134687 - 26816.192244 = 9345.94 $ more for 1 more year of experience
Coefficient = model.coef_
print('Coefficient is:', Coefficient)
Coefficient is: [9345.94244312]
- b0 = 26816.19
- b1 = 9345.94
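The relationship between `intercept_`, `coef_`, and `predict` can be checked directly. A sketch on toy data generated from a known line (not the salary data, so the recovered b0 and b1 are known in advance):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Toy data lying exactly on y = 1 + 2x (illustration only)
X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([1.0, 3.0, 5.0, 7.0])

model = LinearRegression().fit(X, y)
b0 = model.intercept_
b1 = model.coef_[0]

# Plugging x = 12 into y = b0 + b1*x reproduces model.predict
manual = b0 + b1 * 12
assert np.isclose(manual, model.predict([[12]])[0])
print(b0, b1)  # close to 1.0 and 2.0
```

This is exactly how the final regression equation above (Salary = 26816.19 + 9345.94 × YearsExperience) was assembled from the fitted attributes.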
plt.scatter(X_train, y_train)
plt.plot(X_train, model.predict(X_train), color = 'red')
plt.title('Salary vs Experience (Training set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()
- Visualising the regression line:
-> plt.plot(X_train, model.predict(X_train), color = 'red')
- The regression line comes from a single equation:
-> Salary = 26816.19 + 9345.94 × YearsExperience
- The predicted salaries of both the training & test sets
-> lie on the same regression line
plt.scatter(X_test, y_test)
plt.plot(X_train, model.predict(X_train), color = 'red')
plt.title('Salary vs Experience (Test set)')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()
- Training set: blue points
- Test set: green points
plt.scatter(X_train, y_train, color='blue')
plt.scatter(X_test, y_test, color='green')
plt.plot(X_train, model.predict(X_train), color = 'red')
plt.title('Salary vs Experience')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.show()