Data Analysis with Python (Part 4)
Model Evaluation
An important step in testing your model is to split your data into training and testing data.
Sometimes you do not have sufficient testing data; as a result, you may want to perform Cross-validation. Let's go over several methods that you can use for Cross-validation.
You can also use the function 'cross_val_predict' to predict the output. The function splits the data into the specified number of folds; each fold in turn is held out to get its predictions, while the rest of the folds are used as training data.
Overfitting, Underfitting and Model Selection
It turns out that the test data, sometimes referred to as the out-of-sample data, is a much better measure of how well your model performs in the real world. One reason for this is overfitting.
Overfitting
Overfitting occurs when the model fits the noise, not the underlying process. Therefore, when you test your model on the test set, it does not perform as well, because it is modelling noise rather than the underlying process that generated the relationship.
The lower the R^2, the worse the model; a negative R^2 is a sign of overfitting.
Ridge regression
In Ridge Regression we will see how the parameter alpha changes the model.
Grid Search
The term alpha is a hyperparameter; sklearn provides the class GridSearchCV to make the process of finding the best hyperparameter simple.
An important step in testing your model is to split your data into training and testing data.
Sometimes you do not have sufficient testing data; as a result, you may want to perform Cross-validation. Let's go over several methods that you can use for Cross-validation.
You can also use the function 'cross_val_predict' to predict the output. The function splits the data into the specified number of folds; each fold in turn is held out to get its predictions, while the rest of the folds are used as training data.
Overfitting, Underfitting and Model Selection
It turns out that the test data, sometimes referred to as the out-of-sample data, is a much better measure of how well your model performs in the real world. One reason for this is overfitting.
Overfitting
Overfitting occurs when the model fits the noise, not the underlying process. Therefore, when you test your model on the test set, it does not perform as well, because it is modelling noise rather than the underlying process that generated the relationship.
The lower the R^2, the worse the model; a negative R^2 is a sign of overfitting.
Ridge regression
In Ridge Regression we will see how the parameter alpha changes the model.
Grid Search
The term alpha is a hyperparameter; sklearn provides the class GridSearchCV to make the process of finding the best hyperparameter simple.
|
import pandas as pd
import numpy as np
from test.test_functools import capture
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE: "%matplotlib inline" is an IPython/Jupyter magic, not Python syntax.
# It must stay commented out when this file is run as a plain script.
#%matplotlib inline

# Load the data set and store it in the DataFrame ``df``.
path = "C:/Users/thakudev/PYTHON/Data/module_5_auto.csv"
df = pd.read_csv(path)
#print(df.head())

# Save a copy of the loaded data in the current working directory.
df.to_csv("module_5_auto.csv")

# Keep only the numeric (and boolean) columns. This uses the public
# ``select_dtypes`` API instead of the private ``DataFrame._get_numeric_data``.
df = df.select_dtypes(include=["number", "bool"])
print(df.head())
#Libraries for plotting
from IPython.display import display
from ipywidgets import widgets
# The original statement was wrapped across two lines without parentheses,
# which is a syntax error; it is restored to a single import here.
from ipywidgets import interact, interactive, fixed, interact_manual
#Functions for plotting
def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title):
    """Draw the kernel density estimates of two samples on one figure.

    Parameters
    ----------
    RedFunction : values whose density curve is drawn in red.
    BlueFunction : values whose density curve is drawn in blue.
    RedName : legend label for the red curve.
    BlueName : legend label for the blue curve.
    Title : title of the figure.

    The figure is shown with ``plt.show()`` and then closed; nothing is
    returned.
    """
    width = 12
    height = 10
    plt.figure(figsize=(width, height))

    # ``sns.distplot`` is deprecated; ``distplot(..., hist=False)`` is
    # replaced by its documented equivalent ``kdeplot``.
    ax1 = sns.kdeplot(RedFunction, color="r", label=RedName)
    # Draw the second curve on the same axes so both densities overlap.
    sns.kdeplot(BlueFunction, color="b", label=BlueName, ax=ax1)

    plt.title(Title)
    plt.xlabel('Price (in dollars)')
    plt.ylabel('Proportion of Cars')
    # Request the legend explicitly so the curve labels are always displayed.
    plt.legend()

    plt.show()
    plt.close()
def PollyPlot(xtrain, xtest, y_train, y_test, lr, poly_transform):
    """Plot training points, test points and the fitted polynomial curve.

    Parameters
    ----------
    xtrain : training data; must expose ``.values`` (e.g. a DataFrame).
    xtest : testing data; must expose ``.values``.
    y_train : target values plotted against ``xtrain``.
    y_test : target values plotted against ``xtest``.
    lr : linear regression object; its ``predict`` method is called.
    poly_transform : polynomial transformation object; its
        ``fit_transform`` method is called on the plotting grid.

    The figure is shown with ``plt.show()`` and then closed; nothing is
    returned.
    """
    width = 12
    height = 10
    plt.figure(figsize=(width, height))

    # Evaluate the model on an evenly spaced grid (step 0.1) that spans the
    # combined range of the training and testing inputs.
    xmax = max([xtrain.values.max(), xtest.values.max()])
    xmin = min([xtrain.values.min(), xtest.values.min()])
    x = np.arange(xmin, xmax, 0.1)

    plt.plot(xtrain, y_train, 'ro', label='Training Data')
    plt.plot(xtest, y_test, 'go', label='Test Data')
    # reshape(-1, 1) turns the 1-D grid into a single-column 2-D array
    # before it is passed to the transformer.
    plt.plot(x, lr.predict(poly_transform.fit_transform(x.reshape(-1, 1))), label='Predicted Function')
    plt.ylim([-10000, 60000])
    plt.ylabel('Price')
    plt.legend()

    # Show and close the figure, as DistributionPlot does; without this the
    # plot is never rendered when the file runs as a plain script.
    plt.show()
    plt.close()
#An important step in testing your model is to split your data into training and testing data.
# 'price' is the target; every other column of df is a candidate feature.
y_data = df['price']
x_data = df.drop('price', axis=1)

#we randomly split our data into training and testing data using the function train_test_split
from sklearn.model_selection import train_test_split

# test_size=0.15 holds out 15% of the rows; random_state=1 makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.15, random_state=1)

print("Number of test samples:", x_test.shape[0])
print("Number of Training ", x_train.shape[0])
from sklearn.linear_model import LinearRegression

# Fit a simple linear regression of price on the single feature 'horsepower'.
lre = LinearRegression()
print("1:", lre.fit(x_train[['horsepower']], y_train))
#cal R^2 on the test data
print("2:", lre.score(x_test[['horsepower']], y_test))
# R^2 on the training data, for comparison
print("3:", lre.score(x_train[['horsepower']], y_train))

#cross_val_score.
from sklearn.model_selection import cross_val_score

# cv=4 evaluates the model on 4 folds; one score is returned per fold.
Rcross = cross_val_score(lre, x_data[['horsepower']], y_data, cv=4)
print("Rcross", Rcross)
print("The mean of the folds are", Rcross.mean(), "and the standard deviation is", Rcross.std())

# scoring='neg_mean_squared_error' yields negated MSE values, so multiply by
# -1 to report the MSE itself. The original discarded this result as a bare
# expression; it is printed here so the script actually reports it.
print("MSE per fold:", -1 * cross_val_score(lre, x_data[['horsepower']], y_data, cv=4, scoring='neg_mean_squared_error'))

from sklearn.model_selection import cross_val_predict

# Cross-validated predictions for the input rows.
yhat = cross_val_predict(lre, x_data[['horsepower']], y_data, cv=4)
# Previously a bare expression (notebook residue); print the first five values.
print("cross_val_predict:", yhat[0:5])
#create Multiple linear regression objects and train the model
lr = LinearRegression()
lr.fit(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']], y_train)

#Prediction using training data:
yhat_train = lr.predict(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
print("yhat_train", yhat_train[0:5])

#Prediction using test data:
yhat_test = lr.predict(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
print("yhat_test", yhat_test[0:5])

# Compare the distribution of the predictions with the distribution of the
# actual prices, first on the training data and then on the test data.
# (The title and label strings were broken across lines in the original,
# which is a syntax error; they are restored to single-line literals.)
Title = 'Distribution Plot of Predicted Value Using Training Data vs Training Data Distribution'
DistributionPlot(y_train, yhat_train, "Actual Values (Train)", "Predicted Values (Train)", Title)

Title = 'Distribution Plot of Predicted Value Using Test Data vs Data Distribution of Test Data'
DistributionPlot(y_test, yhat_test, "Actual Values (Test)", "Predicted Values (Test)", Title)
#Overfitting
from sklearn.preprocessing import PolynomialFeatures

# Re-split the data with a larger test share (45%) and a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.45, random_state=0)

# Degree-5 polynomial expansion of the single feature 'horsepower'.
pr = PolynomialFeatures(degree=5)
x_train_pr = pr.fit_transform(x_train[['horsepower']])
# Only transform the test data: the transformer is fitted on the training
# data and must not be refit on the test set.
x_test_pr = pr.transform(x_test[['horsepower']])
print(pr)

#create a linear regression model "poly" and train it
poly = LinearRegression()
poly.fit(x_train_pr, y_train)

yhat = poly.predict(x_test_pr)
print("Predicted values:", yhat[0:4])
print("True values:", y_test[0:4].values)

PollyPlot(x_train[['horsepower']], x_test[['horsepower']], y_train, y_test, poly, pr)

# The original computed both scores as bare expressions and discarded them;
# they are printed here so the script reports them.
#R^2 of the training data:
print("R^2 of the training data:", poly.score(x_train_pr, y_train))
#R^2 of the test data:
print("R^2 of the test data:", poly.score(x_test_pr, y_test))
#Ridge regression
# Degree-2 polynomial expansion of six features.
pr = PolynomialFeatures(degree=2)
x_train_pr = pr.fit_transform(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg', 'normalized-losses', 'symboling']])
# Only transform the test data: the transformer is fitted on the training
# data and must not be refit on the test set.
x_test_pr = pr.transform(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg', 'normalized-losses', 'symboling']])

from sklearn.linear_model import Ridge

#create a Ridge regression object, setting the regularization parameter to 0.1
RigeModel = Ridge(alpha=0.1)
RigeModel.fit(x_train_pr, y_train)
# The original computed this score as a bare expression and discarded it.
print("R^2 of the test data:", RigeModel.score(x_test_pr, y_test))

yhat = RigeModel.predict(x_test_pr)
print('predicted:', yhat[0:4])
print('test set :', y_test[0:4].values)
#Grid Search
from sklearn.model_selection import GridSearchCV

# Candidate values for the Ridge regularization strength alpha.
# - The duplicated 100000 entry was removed; it only repeated the same fit.
# - The 'normalize' option was dropped from the grid: Ridge no longer accepts
#   it in current scikit-learn releases, so the search would fail with an
#   invalid-parameter error. Scale the features in a preprocessing step if
#   normalization is required.
parameters1 = [{'alpha': [0.001, 0.1, 1, 10, 100, 1000, 10000, 100000]}]
print(parameters1)

RR = Ridge()

#Create a ridge grid search object
Grid1 = GridSearchCV(RR, parameters1, cv=4)

#Fit the model
# Fit on the training split only. The original fit on the full data set and
# then scored on x_test, which is part of that data, so the reported test
# score was not a true out-of-sample score.
Grid1.fit(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']], y_train)

BestRR = Grid1.best_estimator_
print(BestRR)

# test our model on the test data
print(BestRR.score(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']], y_test))
|


