Tuesday, April 23, 2019

Data Analysis with Python (Part 4)

Model Evaluation

An important step in testing your model is to split your data into training and testing data.
Sometimes you do not have sufficient testing data; as a result, you may want to perform cross-validation. Let's go over several methods that you can use for cross-validation.

You can also use the function 'cross_val_predict' to predict the output. The function splits the data into the specified number of folds; each fold in turn is held out and predicted by a model trained on the remaining folds, so you end up with one out-of-sample prediction for every data point.


Overfitting, Underfitting and Model Selection

It turns out that the test data, sometimes referred to as the out-of-sample data, is a much better measure of how well your model performs in the real world. One reason for this is overfitting.

Overfitting

Overfitting occurs when the model fits the noise rather than the underlying process. As a result, when you test the model on the test set it does not perform well, because it is modelling noise instead of the relationship that the underlying process generated.

The lower the R^2, the worse the model; a negative R^2 is a sign of overfitting.
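As a quick illustration (a toy sketch with made-up numbers, not data from the lab), R^2 can be computed by hand as 1 - SS_res/SS_tot; whenever the model's squared error is larger than the error of simply predicting the mean, R^2 turns negative:

import numpy as np

# Hypothetical actual values and deliberately bad predictions
y_true = np.array([10.0, 12.0, 14.0, 16.0])
y_pred = np.array([20.0, 5.0, 25.0, 2.0])

ss_res = np.sum((y_true - y_pred) ** 2)          # residual sum of squares
ss_tot = np.sum((y_true - y_true.mean()) ** 2)   # total sum of squares
print(1 - ss_res / ss_tot)                       # about -22.3, i.e. negative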

Ridge regression

In Ridge regression we will see how the parameter alpha changes the model.

Grid Search

The term alpha is a hyperparameter. sklearn has the class GridSearchCV to make the process of finding the best hyperparameter simpler.
 
 
 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline  (only needed when running inside a Jupyter notebook)


#load data and store in dataframe df:

path="C:/Users/thakudev/PYTHON/Data/module_5_auto.csv"
df=pd.read_csv(path)
#print(df.head())


df.to_csv("module_5_auto.csv")   # save a local copy
df=df._get_numeric_data()        # keep only the numeric columns
print(df.head())

#Libraries for plotting
from IPython.display import display
from ipywidgets import widgets, interact, interactive, fixed, interact_manual

#Functions for plotting
def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title):
    width = 12
    height = 10
    plt.figure(figsize=(width, height))

    ax1 = sns.distplot(RedFunction, hist=False, color="r", label=RedName)
    ax2 = sns.distplot(BlueFunction, hist=False, color="b", label=BlueName, ax=ax1)

    plt.title(Title)
    plt.xlabel('Price (in dollars)')
    plt.ylabel('Proportion of Cars')

    plt.show()
    plt.close()
   
def PollyPlot(xtrain, xtest, y_train, y_test, lr,poly_transform):
    width = 12
    height = 10
    plt.figure(figsize=(width, height))
   
   
    # xtrain, xtest: training and testing data for the predictor variable
    # y_train, y_test: training and testing data for the target variable
    # lr: trained linear regression object
    # poly_transform: polynomial transformation object

    xmax=max([xtrain.values.max(), xtest.values.max()])

    xmin=min([xtrain.values.min(), xtest.values.min()])

    x=np.arange(xmin, xmax, 0.1)


    plt.plot(xtrain, y_train, 'ro', label='Training Data')
    plt.plot(xtest, y_test, 'go', label='Test Data')
    plt.plot(x, lr.predict(poly_transform.fit_transform(x.reshape(-1, 1))), label='Predicted Function')
    plt.ylim([-10000, 60000])
    plt.ylabel('Price')
    plt.legend()
    plt.show()
    plt.close()

#An important step in testing your model is to split your data into training and testing data.
y_data=df['price']
x_data=df.drop('price',axis=1)

#we randomly split our data into training and testing sets using the function train_test_split

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x_data,y_data,test_size=0.15, random_state=1)
print("Number of test samples:",x_test.shape[0])
print("Number of Training ",x_train.shape[0])

from sklearn.linear_model import LinearRegression
lre=LinearRegression()
lre.fit(x_train[['horsepower']],y_train)
#R^2 on the test data
print("R^2 (test):",lre.score(x_test[['horsepower']],y_test))
#R^2 on the training data
print("R^2 (train):",lre.score(x_train[['horsepower']],y_train))

#Cross-validation with cross_val_score
from sklearn.model_selection import cross_val_score
Rcross=cross_val_score(lre, x_data[['horsepower']],y_data,cv=4)
print("Rcross",Rcross)
print("The mean of the folds are",Rcross.mean(),"and the standard deviation is",Rcross.std())

#sklearn's 'neg_mean_squared_error' returns the negative MSE (so that higher is better);
#multiply by -1 to recover the usual MSE
print(-1 * cross_val_score(lre,x_data[['horsepower']], y_data,cv=4,scoring='neg_mean_squared_error'))

from sklearn.model_selection import cross_val_predict
yhat = cross_val_predict(lre,x_data[['horsepower']], y_data,cv=4)
print("yhat[0:5]:", yhat[0:5])
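#Note (added for clarity): cross_val_predict returns one out-of-fold
#prediction per sample, not one value per fold, so even with cv=4
#yhat has the same length as y_data:
print("Number of predictions:", len(yhat), "Number of samples:", len(y_data))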



#create Multiple linear regression objects and train the model

lr = LinearRegression()
lr.fit(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']], y_train)

#Prediction using training data:
yhat_train = lr.predict(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
print("yhat_train",yhat_train[0:5])

#Prediction using test data:
yhat_test = lr.predict(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
print("yhat_test",yhat_test[0:5])


Title = 'Distribution Plot of Predicted Values Using Training Data vs Training Data Distribution'
DistributionPlot(y_train, yhat_train, "Actual Values (Train)", "Predicted Values (Train)", Title)


Title='Distribution Plot of Predicted Values Using Test Data vs Test Data Distribution'
DistributionPlot(y_test,yhat_test,"Actual Values (Test)","Predicted Values (Test)",Title)

#Overfitting
from sklearn.preprocessing import PolynomialFeatures
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.45, random_state=0)
pr = PolynomialFeatures(degree=5)
x_train_pr = pr.fit_transform(x_train[['horsepower']])
x_test_pr = pr.transform(x_test[['horsepower']])  # transform only; the transformer is fit on the training data
print(pr)

#create a linear regression model "poly" and train it
poly = LinearRegression()
poly.fit(x_train_pr, y_train)
yhat = poly.predict(x_test_pr)
print("Predicted values:", yhat[0:4])
print("True values:", y_test[0:4].values)

PollyPlot(x_train[['horsepower']], x_test[['horsepower']], y_train, y_test, poly,pr)

#R^2 of the training data:
print("R^2 (train):", poly.score(x_train_pr, y_train))

#R^2 of the test data:
print("R^2 (test):", poly.score(x_test_pr, y_test))
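#A quick sketch: loop over polynomial orders to see how the test R^2
#changes with model complexity (overfitting shows up as a drop at high order)
Rsqu_test = []
for n in [1, 2, 3, 4, 5]:
    pr_n = PolynomialFeatures(degree=n)
    x_train_n = pr_n.fit_transform(x_train[['horsepower']])
    x_test_n = pr_n.transform(x_test[['horsepower']])
    lr_n = LinearRegression()
    lr_n.fit(x_train_n, y_train)
    Rsqu_test.append(lr_n.score(x_test_n, y_test))
print("Test R^2 by polynomial order:", Rsqu_test)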


#Ridge regression
pr=PolynomialFeatures(degree=2)
x_train_pr=pr.fit_transform(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','normalized-losses','symboling']])
x_test_pr=pr.transform(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','normalized-losses','symboling']])

from sklearn.linear_model import Ridge
#create a Ridge regression object, setting the regularization parameter to 0.1
RidgeModel=Ridge(alpha=0.1)
RidgeModel.fit(x_train_pr, y_train)
print("R^2 (test):", RidgeModel.score(x_test_pr, y_test))
yhat = RidgeModel.predict(x_test_pr)
print('predicted:', yhat[0:4])
print('test set :', y_test[0:4].values)
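#A brief sketch (alpha values chosen for illustration): refit the Ridge
#model for several alphas to see how the regularization strength changes
#the test R^2
for a in [0.001, 0.1, 1, 10, 100]:
    rm = Ridge(alpha=a)
    rm.fit(x_train_pr, y_train)
    print("alpha =", a, " test R^2 =", rm.score(x_test_pr, y_test))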

#Grid Search
from sklearn.model_selection import GridSearchCV
parameters1= [{'alpha': [0.001, 0.1, 1, 10, 100, 1000, 10000, 100000], 'normalize': [True, False]}]
print(parameters1)
RR=Ridge()

#Create a ridge grid search object
Grid1=GridSearchCV(RR,parameters1,cv=4)
#Fit the model
Grid1.fit(x_data[['horsepower','curb-weight','engine-size','highway-mpg']],y_data)
BestRR=Grid1.best_estimator_
print(BestRR)


# test our model on the test data
print(BestRR.score(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']], y_test))
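#As a quick follow-up sketch, GridSearchCV also exposes the winning
#hyperparameters and the mean cross-validated score of the best model:
print("Best parameters:", Grid1.best_params_)
print("Best CV R^2:", Grid1.best_score_)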



1 comment:

  1. Hi, I have a doubt.

    Can you please tell me why we choose yhat[0:5] in the code below? According to my understanding, if we are using cv=4 then yhat should give 4 values. Please let me know if I have misunderstood something.

    yhat = cross_val_predict(lre,x_data[['horsepower']], y_data,cv=4)
    yhat[0:5]
