import pandas as pd
import numpy as np
import warnings
# Silence every warning for the remainder of the run.
# NOTE(review): this blanket filter also hides anything the model fits
# further down might warn about — confirm that is intended.
warnings.filterwarnings('ignore')

# Read the training table straight from its hosted CSV.
url = "https://datascienceschools.github.io/Machine_Learning/Feature_Selection/HousePrice_Train.csv"
df = pd.read_csv(url)
df.head()  # bare expression: the preview is discarded unless run interactively

# Target is the 'SalePrice' column; the predictors are every remaining
# column except the 'Id' column.
y = df['SalePrice']
X = df.drop(columns=['Id', 'SalePrice'])
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths; the same grid is searched for both
# Ridge and Lasso.
parameters = {'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}

# --- Ridge: 5-fold cross-validated grid search over alpha ---
ridge = Ridge()
ridge_regressor = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error', cv=5)
ridge_regressor.fit(X, y)
print(ridge_regressor.best_params_)
# With scoring='neg_mean_squared_error', best_score_ holds the *negated*
# mean cross-validated MSE (scikit-learn scorers are "higher is better").
# Flip the sign so the number printed under the "MSE" label is the MSE.
print("\nMSE: ", -ridge_regressor.best_score_)

# --- Lasso: same search, same grid, same scoring ---
lasso = Lasso()
lasso_regressor = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv=5)
lasso_regressor.fit(X, y)
print(lasso_regressor.best_params_)
# Sign flipped for the same reason as above, so the two MSE values can be
# compared directly (lower is better).
print("\nMSE: ", -lasso_regressor.best_score_)
# Lasso regression, which has the lower MSE, is selected.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

# Wrap a Lasso estimator in SelectFromModel and fit it on the full data;
# the fitted selector exposes which columns of X it keeps.
# NOTE(review): alpha is hard-coded to 0.001 here rather than read from the
# grid-search result — confirm it matches the intended value.
lasso_estimator = Lasso(alpha=0.001, random_state=0)
model = SelectFromModel(lasso_estimator)
model.fit(X, y)

# get_support() is used as a mask over X's columns to get the kept names.
support_mask = model.get_support()
selected_features = X.columns[support_mask]

# Summary: total columns, columns kept, and how many fitted coefficients
# are exactly zero.
total_feature_count = X.shape[1]
selected_feature_count = len(selected_features)
zero_coefficient_count = np.sum(model.estimator_.coef_ == 0)
print('Number of Total Features: {}'.format(total_feature_count))
print('Number of Features Selected: {}'.format(selected_feature_count))
print('Features with Coefficients Shrank to Zero: {}'.format(zero_coefficient_count))
print('\nSelected Features:\n\n', selected_features)

# Reduced design matrix with the target appended as the last column.
X_Final = X[selected_features]
final_data = pd.concat([X_Final, y], axis=1)
final_data.head()  # bare expression: only shown when run interactively
# The data is now ready for training machine learning models.
# Write the final table to a CSV file in the working directory, leaving
# out the DataFrame index.
output_csv = 'HousePrice_Train_Final.csv'
final_data.to_csv(output_csv, index=False)