- In multiple linear regression: no need to manually select the significant features
- the scikit-learn LinearRegression class fits all the features when training the model, so manual backward elimination is not required
- note: the most statistically significant features are the ones with the lowest p-values, not the highest
- In multiple linear regression: no need to apply feature scaling
- the multiple regression equation has a coefficient for each independent variable
- each coefficient is multiplied by its independent variable
- the coefficients compensate for the different scales, putting everything on a comparable footing
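To see why scaling is unnecessary here, the sketch below (synthetic data; all values are illustrative, not from the startups dataset) fits LinearRegression on raw and on standardized versions of the same features and checks that the predictions agree — the coefficients simply absorb the change of scale:

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Two features on wildly different scales (0-1 vs 0-1,000,000)
rng = np.random.default_rng(0)
X = np.column_stack([rng.uniform(0, 1, 50), rng.uniform(0, 1e6, 50)])
y = 3 * X[:, 0] + 2e-5 * X[:, 1] + rng.normal(0, 0.1, 50)

# Fit once on raw features, once on standardized features
raw = LinearRegression().fit(X, y)
scaler = StandardScaler().fit(X)
scaled = LinearRegression().fit(scaler.transform(X), y)

# Predictions agree up to floating-point error
print(np.allclose(raw.predict(X), scaled.predict(scaler.transform(X))))  # True
```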
- In multiple linear regression: no need to check the OLS assumptions
- OLS assumptions: the assumptions of linear regression
- Instead, try different models
- Select the model leading to the highest accuracy
- If the dataset has linear relationships, the model performs well
-> leads to high accuracy
- If the dataset doesn't have linear relationships, the model performs poorly
-> leads to low accuracy
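This can be checked with the R² score. The sketch below (synthetic data, illustrative only) fits the same LinearRegression to a linear and a strongly non-linear target — the linear one scores near 1, the non-linear one scores far lower:

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

rng = np.random.default_rng(1)
x = rng.uniform(0, 10, (200, 1))

# Linear relationship -> a straight line fits well
y_linear = 4 * x[:, 0] + rng.normal(0, 1, 200)
# Non-linear relationship -> a straight line fits badly
y_nonlinear = 10 * np.sin(x[:, 0]) + rng.normal(0, 1, 200)

model = LinearRegression()
r2_lin = r2_score(y_linear, model.fit(x, y_linear).predict(x))
r2_non = r2_score(y_nonlinear, model.fit(x, y_nonlinear).predict(x))
print(r2_lin, r2_non)  # r2_lin is close to 1, r2_non is much lower
```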
- In multiple linear regression: no need to avoid the dummy variable trap manually
- there is no need to remove one of the dummy columns yourself
- scikit-learn's LinearRegression tolerates the redundant column, so the model avoids this trap automatically
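That said, if you prefer to drop one dummy column explicitly at the encoding stage, OneHotEncoder supports `drop='first'`. A minimal sketch (state names borrowed from the dataset below):

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

states = np.array([['New York'], ['California'], ['Florida'], ['New York']])

# Default: one column per category (three columns for three states)
full = OneHotEncoder().fit_transform(states).toarray()

# drop='first' removes one dummy column, avoiding the trap explicitly
dropped = OneHotEncoder(drop='first').fit_transform(states).toarray()

print(full.shape, dropped.shape)  # (4, 3) (4, 2)
```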
- y = b0 + b1 x1 + b2 x2 + b3 x3 + ... + bn xn
-> y is the dependent variable
-> Xs are the independent variables
-> bs are Coefficients
-> b0 is the Intercept (Constant)
- Importing the Relevant Libraries
- Loading the Data
- Declaring the Dependent and the Independent variables
- One Hot Encoding the Independent Variable (State)
- Splitting the dataset into the Training set and Test set
- Linear Regression Model
- Creating a Linear Regression
- Fitting The Model
- Predicting the Test Set Results
- Creating a Summary Table (Test Set Results)
- Making predictions
- Making a Single Observation Prediction
- Making Multiple Observations Prediction
- Intercept, Coefficients & Final Regression Equation
- Finding the intercept
- Finding the coefficients
- Final Regression Equation (y = b0 + b1 x1 + b2 x2 + ... + b6 x6)
- Data visualization (not possible)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
url = "https://datascienceschools.github.io//Machine_Learning/Sklearn/Case_Study/Startups/50_Startups.csv"
df = pd.read_csv(url)
df.head()
| | R&D Spend | Administration | Marketing Spend | State | Profit |
|---|---|---|---|---|---|
| 0 | 165349.20 | 136897.80 | 471784.10 | New York | 192261.83 |
| 1 | 162597.70 | 151377.59 | 443898.53 | California | 191792.06 |
| 2 | 153441.51 | 101145.55 | 407934.54 | Florida | 191050.39 |
| 3 | 144372.41 | 118671.85 | 383199.62 | New York | 182901.99 |
| 4 | 142107.34 | 91391.77 | 366168.42 | Florida | 166187.94 |
- X : (independent variables) -> inputs or features
- y : (dependent variable) -> output or target
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
X
array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       ...,
       [1.0, 0.0, 0.0, 0.0, 135426.92, 0.0],
       [0.0, 0.0, 1.0, 542.05, 51743.15, 0.0],
       [1.0, 0.0, 0.0, 0.0, 116983.8, 45173.06]], dtype=object)
(output truncated — 50 rows in total; the first three columns are the one-hot encoded State dummies)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
- LinearRegression Class from linear_model Module of sklearn Library
- model -> Object of LinearRegression Class
from sklearn.linear_model import LinearRegression
model = LinearRegression()
- fit method -> training the model
model.fit(X_train, y_train)
LinearRegression()
- y_pred -> the predicted profits
y_pred = model.predict(X_test)
- Comparing Predicted_Profit & Real_Profit
data = pd.DataFrame(X_test).rename(columns={0: "California",
                                            1: "Florida",
                                            2: "New_York",
                                            3: "R&D_Spend",
                                            4: "Administration",
                                            5: "Marketing_Spend"})
data['Predicted_Profit'] = y_pred
data['Real_Profit'] = y_test
data['Difference'] = y_pred - y_test
data
| | California | Florida | New_York | R&D_Spend | Administration | Marketing_Spend | Predicted_Profit | Real_Profit | Difference |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 66051.5 | 182646 | 118148 | 103015.201598 | 103282.38 | -267.178402 |
| 1 | 1 | 0 | 0 | 100672 | 91790.6 | 249745 | 132582.277608 | 144259.40 | -11677.122392 |
| 2 | 0 | 1 | 0 | 101913 | 110594 | 229161 | 132447.738452 | 146121.95 | -13674.211548 |
| 3 | 0 | 1 | 0 | 27892.9 | 84710.8 | 164471 | 71976.098513 | 77798.83 | -5822.731487 |
| 4 | 0 | 1 | 0 | 153442 | 101146 | 407935 | 178537.482211 | 191050.39 | -12512.907789 |
| 5 | 0 | 0 | 1 | 72107.6 | 127865 | 353184 | 116161.242302 | 105008.31 | 11152.932302 |
| 6 | 0 | 0 | 1 | 20229.6 | 65947.9 | 185265 | 67851.692097 | 81229.06 | -13377.367903 |
| 7 | 0 | 0 | 1 | 61136.4 | 152702 | 88218.2 | 98791.733747 | 97483.56 | 1308.173747 |
| 8 | 0 | 1 | 0 | 73994.6 | 122783 | 303319 | 113969.435330 | 110352.25 | 3617.185330 |
| 9 | 0 | 1 | 0 | 142107 | 91391.8 | 366168 | 167921.065696 | 166187.94 | 1733.125696 |
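The Difference column can be summarized with standard error metrics. A sketch using the first four rows of the comparison table above as stand-ins for the full `y_test` and `y_pred` arrays:

```python
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score

# First four real/predicted profits from the comparison table above
y_test = np.array([103282.38, 144259.40, 146121.95, 77798.83])
y_pred = np.array([103015.20, 132582.28, 132447.74, 71976.10])

mae = mean_absolute_error(y_test, y_pred)  # average absolute difference in $
r2 = r2_score(y_test, y_pred)              # share of variance explained
print(round(mae, 2), round(r2, 3))
```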
- Predicting the profit of a Californian startup which spent
  - 160000 in R&D
  - 130000 in Administration
  - 300000 in Marketing
- California is encoded as 1, 0, 0 (the first dummy column)
- Predicted profit -> $ 181566.92
- the predict method always expects a 2D array as the format of its inputs
- putting the input into a double pair of square brackets makes it a 2D array
Simply put:
1,0,0,160000,130000,300000 → scalars
[1,0,0,160000,130000,300000] → 1D array
[[1,0,0,160000,130000,300000]] → 2D array
model.predict([[1, 0, 0, 160000, 130000, 300000]])
array([181566.92389385])
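A quick shape check in plain NumPy (illustrative, using the same input values as above) shows why the double brackets matter:

```python
import numpy as np

row = [1, 0, 0, 160000, 130000, 300000]

a1 = np.array(row)    # 1D: shape (6,) — what predict rejects
a2 = np.array([row])  # 2D: shape (1, 6) — one observation, six features

print(a1.shape, a2.shape)  # (6,) (1, 6)

# reshape(1, -1) is another common way to turn one sample into a 2D array
print(a1.reshape(1, -1).shape)  # (1, 6)
```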
new_data = np.array([
[0.0, 0.0, 1.0, 165000, 136000, 470000],
[1.0, 0.0, 0.0, 160000, 150000, 440000],
[0.0, 1.0, 0.0, 150000, 100000, 400000]])
new_startups = pd.DataFrame(new_data).rename(columns={0: "California",
                                                      1: "Florida",
                                                      2: "New_York",
                                                      3: "R&D_Spend",
                                                      4: "Administration",
                                                      5: "Marketing_Spend"})
new_startups['predicted_profit'] = model.predict(new_startups)
new_startups
| | California | Florida | New_York | R&D_Spend | Administration | Marketing_Spend | predicted_profit |
|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 1.0 | 165000.0 | 136000.0 | 470000.0 | 192554.640892 |
| 1 | 1.0 | 0.0 | 0.0 | 160000.0 | 150000.0 | 440000.0 | 187350.019466 |
| 2 | 0.0 | 1.0 | 0.0 | 150000.0 | 100000.0 | 400000.0 | 175547.432467 |
Intercept = model.intercept_
print('Intercept is:', Intercept)
Intercept is: 42467.52924854249
Coefficients = model.coef_
print('Coefficients are:\n\n', Coefficients)
Coefficients are: [ 8.66383692e+01 -8.72645791e+02 7.86007422e+02 7.73467193e-01 3.28845975e-02 3.66100259e-02]
Intercept:
- b0: 42467.52924854249
Coefficients:
- b1: 8.66383692e+01
- b2: -8.72645791e+02
- b3: 7.86007422e+02
- b4: 7.73467193e-01
- b5: 3.28845975e-02
- b6: 3.66100259e-02
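The final equation can be verified by hand: plugging the Californian startup's inputs into y = b0 + b1 x1 + ... + b6 x6, using the intercept and coefficients printed above, reproduces the earlier single-observation prediction:

```python
import numpy as np

# Intercept and coefficients copied from the fitted model above
b0 = 42467.52924854249
b = np.array([8.66383692e+01, -8.72645791e+02, 7.86007422e+02,
              7.73467193e-01, 3.28845975e-02, 3.66100259e-02])

# Californian startup: dummies (1, 0, 0), then R&D, Administration, Marketing
x = np.array([1, 0, 0, 160000, 130000, 300000])

# y = b0 + b1*x1 + ... + b6*x6, evaluated as a dot product
profit = b0 + b @ x
print(round(profit, 2))  # 181566.92, matching model.predict above
```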
- In multiple linear regression, it is not possible to visualize the data in a single plot
- There are four features instead of one
- Four features would need a five-dimensional graph
- It is impossible to plot a graph like the one in simple linear regression
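Although the full feature space cannot be drawn, a predicted-vs-actual scatter is a common 2D substitute: points close to the diagonal indicate a good fit. A sketch using a few of the test-set values from the table above (the Agg backend and output file name are arbitrary choices for this example):

```python
import numpy as np
import matplotlib
matplotlib.use('Agg')  # non-interactive backend so the sketch runs headless
import matplotlib.pyplot as plt

# A few real/predicted profits taken from the comparison table above
y_test = np.array([103282.38, 144259.40, 146121.95, 77798.83, 191050.39])
y_pred = np.array([103015.20, 132582.28, 132447.74, 71976.10, 178537.48])

# Predicted vs. actual: the dashed diagonal marks perfect predictions
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
lims = [y_test.min(), y_test.max()]
ax.plot(lims, lims, linestyle='--')
ax.set_xlabel('Real_Profit')
ax.set_ylabel('Predicted_Profit')
fig.savefig('pred_vs_actual.png')
```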