- Importing the relevant libraries
- Loading data
- Dummy Variables
- Rearranging Columns
- Column Values
- Reordering Columns
- Save Changes
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
# Switch matplotlib over to seaborn's default plot styling.
sns.set()

# Location of the car-price CSV; the literal is split across two lines only
# to keep the line length reasonable (the resulting string is unchanged).
url = (
    "https://datascienceschools.github.io/Machine_Learning/"
    "CaseStudy/LinearRegression/carprice_editted2.csv"
)

# Read the remote CSV into a DataFrame and preview the first rows.
df = pd.read_csv(url)
df.head()
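- It is extremely important that we drop one dummy per categorical feature; otherwise the dummies of that feature always sum to 1 and are perfectly collinear with the intercept (the "dummy variable trap")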
# Replace every object/category-typed column with 0/1 indicator columns.
# drop_first=True omits the first category of each feature so the remaining
# dummies are not perfectly collinear with the intercept.
# dtype=int pins the indicators to integers: pandas >= 2.0 otherwise returns
# booleans, which would later be written to the CSV as True/False instead of
# the numeric 0/1 values a regression input is expected to contain.
df = pd.get_dummies(df, drop_first=True, dtype=int)
df.head()
- Conventionally, the most intuitive order is:
- Dependent variable
- Independent Numerical Variables
- Dummy Variables
# Show the current column labels (displayed when run as a notebook cell).
df.columns.values

# Desired column order: dependent variable first, then the numerical
# predictors, then the dummy indicators grouped by the feature they encode.
cols = [
    # dependent variable
    'log_price',
    # numerical predictors
    'Mileage', 'EngineV',
    # Brand dummies
    'Brand_BMW', 'Brand_Mercedes-Benz', 'Brand_Mitsubishi',
    'Brand_Renault', 'Brand_Toyota', 'Brand_Volkswagen',
    # Body dummies
    'Body_hatch', 'Body_other', 'Body_sedan', 'Body_vagon', 'Body_van',
    # Engine Type dummies
    'Engine Type_Gas', 'Engine Type_Other', 'Engine Type_Petrol',
    # Registration dummy
    'Registration_yes',
]

# Selecting with the full list reorders the columns; a label missing from
# the frame raises KeyError rather than being silently skipped.
df = df[cols]
df.head()
# Persist the reordered frame to the working directory; index=False keeps
# the row index out of the file so it is not written as an extra column.
df.to_csv(path_or_buf='carprice_editted3.csv', index=False)