Tuesday, April 23, 2019

Data Analysis with Python (Part 4)

Model Evaluation

An important step in testing your model is to split your data into training and testing data.
Sometimes you do not have sufficient testing data; as a result, you may want to perform cross-validation. Let's go over several methods that you can use for cross-validation.

You can also use the function 'cross_val_predict' to predict the output. The function splits the data into the specified number of folds; each fold in turn is held out and predicted by a model trained on the remaining folds, so you end up with one out-of-sample prediction for every data point.


Overfitting, Underfitting and Model Selection

It turns out that the test data, sometimes referred to as the out-of-sample data, is a much better measure of how well your model performs in the real world. One reason for this is overfitting.

Overfitting

Overfitting occurs when the model fits the noise rather than the underlying process. As a result, when you test the model on the test set it does not perform well, because it is modelling noise instead of the relationship that the underlying process generated.

The lower the R^2, the worse the model; a negative R^2 is a sign of overfitting.
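As a quick illustration (a toy sketch with made-up numbers, not data from the lab), R^2 can be computed by hand as 1 - SS_res/SS_tot; whenever the model's squared error is larger than the error of simply predicting the mean, R^2 turns negative:

import numpy as np

# Hypothetical actual values and deliberately bad predictions
y_true = np.array([10.0, 12.0, 14.0, 16.0])
y_pred = np.array([20.0, 5.0, 25.0, 2.0])

ss_res = np.sum((y_true - y_pred) ** 2)          # residual sum of squares
ss_tot = np.sum((y_true - y_true.mean()) ** 2)   # total sum of squares
print(1 - ss_res / ss_tot)                       # about -22.3, i.e. negative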

Ridge regression

In Ridge regression we will see how the parameter alpha changes the model.

Grid Search

The term alpha is a hyperparameter. sklearn has the class GridSearchCV to make the process of finding the best hyperparameter simpler.
 
 
 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline  (only needed when running inside a Jupyter notebook)


#load data and store in dataframe df:

path="C:/Users/thakudev/PYTHON/Data/module_5_auto.csv"
df=pd.read_csv(path)
#print(df.head())


df.to_csv("module_5_auto.csv")   # save a local copy
df=df._get_numeric_data()        # keep only the numeric columns
print(df.head())

#Libraries for plotting
from IPython.display import display
from ipywidgets import widgets, interact, interactive, fixed, interact_manual

#Functions for plotting
def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title):
    width = 12
    height = 10
    plt.figure(figsize=(width, height))

    ax1 = sns.distplot(RedFunction, hist=False, color="r", label=RedName)
    ax2 = sns.distplot(BlueFunction, hist=False, color="b", label=BlueName, ax=ax1)

    plt.title(Title)
    plt.xlabel('Price (in dollars)')
    plt.ylabel('Proportion of Cars')

    plt.show()
    plt.close()
   
def PollyPlot(xtrain, xtest, y_train, y_test, lr,poly_transform):
    width = 12
    height = 10
    plt.figure(figsize=(width, height))
   
   
    # xtrain, xtest: training and testing data for the predictor variable
    # y_train, y_test: training and testing data for the target variable
    # lr: trained linear regression object
    # poly_transform: polynomial transformation object

    xmax=max([xtrain.values.max(), xtest.values.max()])

    xmin=min([xtrain.values.min(), xtest.values.min()])

    x=np.arange(xmin, xmax, 0.1)


    plt.plot(xtrain, y_train, 'ro', label='Training Data')
    plt.plot(xtest, y_test, 'go', label='Test Data')
    plt.plot(x, lr.predict(poly_transform.fit_transform(x.reshape(-1, 1))), label='Predicted Function')
    plt.ylim([-10000, 60000])
    plt.ylabel('Price')
    plt.legend()
    plt.show()
    plt.close()

#An important step in testing your model is to split your data into training and testing data.
y_data=df['price']
x_data=df.drop('price',axis=1)

#we randomly split our data into training and testing sets using the function train_test_split

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x_data,y_data,test_size=0.15, random_state=1)
print("Number of test samples:",x_test.shape[0])
print("Number of Training ",x_train.shape[0])

from sklearn.linear_model import LinearRegression
lre=LinearRegression()
lre.fit(x_train[['horsepower']],y_train)
#R^2 on the test data
print("R^2 (test):",lre.score(x_test[['horsepower']],y_test))
#R^2 on the training data
print("R^2 (train):",lre.score(x_train[['horsepower']],y_train))

#Cross-validation with cross_val_score
from sklearn.model_selection import cross_val_score
Rcross=cross_val_score(lre, x_data[['horsepower']],y_data,cv=4)
print("Rcross",Rcross)
print("The mean of the folds are",Rcross.mean(),"and the standard deviation is",Rcross.std())

#sklearn's 'neg_mean_squared_error' returns the negative MSE (so that higher is better);
#multiply by -1 to recover the usual MSE
print(-1 * cross_val_score(lre,x_data[['horsepower']], y_data,cv=4,scoring='neg_mean_squared_error'))

from sklearn.model_selection import cross_val_predict
yhat = cross_val_predict(lre,x_data[['horsepower']], y_data,cv=4)
print("yhat[0:5]:", yhat[0:5])
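#Note (added for clarity): cross_val_predict returns one out-of-fold
#prediction per sample, not one value per fold, so even with cv=4
#yhat has the same length as y_data:
print("Number of predictions:", len(yhat), "Number of samples:", len(y_data))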



#create Multiple linear regression objects and train the model

lr = LinearRegression()
lr.fit(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']], y_train)

#Prediction using training data:
yhat_train = lr.predict(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
print("yhat_train",yhat_train[0:5])

#Prediction using test data:
yhat_test = lr.predict(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
print("yhat_test",yhat_test[0:5])


Title = 'Distribution Plot of Predicted Values Using Training Data vs Training Data Distribution'
DistributionPlot(y_train, yhat_train, "Actual Values (Train)", "Predicted Values (Train)", Title)


Title='Distribution Plot of Predicted Values Using Test Data vs Test Data Distribution'
DistributionPlot(y_test,yhat_test,"Actual Values (Test)","Predicted Values (Test)",Title)

#Overfitting
from sklearn.preprocessing import PolynomialFeatures
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.45, random_state=0)
pr = PolynomialFeatures(degree=5)
x_train_pr = pr.fit_transform(x_train[['horsepower']])
x_test_pr = pr.transform(x_test[['horsepower']])  # transform only; the transformer is fit on the training data
print(pr)

#create a linear regression model "poly" and train it
poly = LinearRegression()
poly.fit(x_train_pr, y_train)
yhat = poly.predict(x_test_pr)
print("Predicted values:", yhat[0:4])
print("True values:", y_test[0:4].values)

PollyPlot(x_train[['horsepower']], x_test[['horsepower']], y_train, y_test, poly,pr)

#R^2 of the training data:
print("R^2 (train):", poly.score(x_train_pr, y_train))

#R^2 of the test data:
print("R^2 (test):", poly.score(x_test_pr, y_test))
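#A quick sketch: loop over polynomial orders to see how the test R^2
#changes with model complexity (overfitting shows up as a drop at high order)
Rsqu_test = []
for n in [1, 2, 3, 4, 5]:
    pr_n = PolynomialFeatures(degree=n)
    x_train_n = pr_n.fit_transform(x_train[['horsepower']])
    x_test_n = pr_n.transform(x_test[['horsepower']])
    lr_n = LinearRegression()
    lr_n.fit(x_train_n, y_train)
    Rsqu_test.append(lr_n.score(x_test_n, y_test))
print("Test R^2 by polynomial order:", Rsqu_test)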


#Ridge regression
pr=PolynomialFeatures(degree=2)
x_train_pr=pr.fit_transform(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','normalized-losses','symboling']])
x_test_pr=pr.transform(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','normalized-losses','symboling']])

from sklearn.linear_model import Ridge
#create a Ridge regression object, setting the regularization parameter to 0.1
RidgeModel=Ridge(alpha=0.1)
RidgeModel.fit(x_train_pr, y_train)
print("R^2 (test):", RidgeModel.score(x_test_pr, y_test))
yhat = RidgeModel.predict(x_test_pr)
print('predicted:', yhat[0:4])
print('test set :', y_test[0:4].values)
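#A brief sketch (alpha values chosen for illustration): refit the Ridge
#model for several alphas to see how the regularization strength changes
#the test R^2
for a in [0.001, 0.1, 1, 10, 100]:
    rm = Ridge(alpha=a)
    rm.fit(x_train_pr, y_train)
    print("alpha =", a, " test R^2 =", rm.score(x_test_pr, y_test))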

#Grid Search
from sklearn.model_selection import GridSearchCV
parameters1= [{'alpha': [0.001, 0.1, 1, 10, 100, 1000, 10000, 100000], 'normalize': [True, False]}]
print(parameters1)
RR=Ridge()

#Create a ridge grid search object
Grid1=GridSearchCV(RR,parameters1,cv=4)
#Fit the model
Grid1.fit(x_data[['horsepower','curb-weight','engine-size','highway-mpg']],y_data)
BestRR=Grid1.best_estimator_
print(BestRR)


# test our model on the test data
print(BestRR.score(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']], y_test))
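#As a quick follow-up sketch, GridSearchCV also exposes the winning
#hyperparameters and the mean cross-validated score of the best model:
print("Best parameters:", Grid1.best_params_)
print("Best CV R^2:", Grid1.best_score_)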



1 comment:

  1. Hi, I have a doubt.

    Can you please tell me why we choose yhat[0:5] in the code below? According to my understanding, if we are using cv=4 then yhat should give 4 values. Please let me know if I have misunderstood something.

    yhat = cross_val_predict(lre,x_data[['horsepower']], y_data,cv=4)
    yhat[0:5]
