import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline  # uncomment when running in a Jupyter notebook
#load data and store in dataframe df:
path = "C:/Users/thakudev/PYTHON/Data/module_5_auto.csv"
df = pd.read_csv(path)
df.to_csv("module_5_auto.csv", index=False)  # save a local copy without the index column
#keep only the numeric columns (select_dtypes replaces the private _get_numeric_data)
df = df.select_dtypes(include=[np.number])
print(df.head())
#Libraries for plotting
from IPython.display import display
from ipywidgets import widgets, interact, interactive, fixed, interact_manual
#Functions for plotting
def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title):
    """Overlay the distributions of two series, e.g. actual vs predicted prices."""
    width = 12
    height = 10
    plt.figure(figsize=(width, height))
    # kdeplot replaces sns.distplot(..., hist=False), which is deprecated in seaborn >= 0.11
    ax1 = sns.kdeplot(RedFunction, color="r", label=RedName)
    sns.kdeplot(BlueFunction, color="b", label=BlueName, ax=ax1)
    plt.title(Title)
    plt.xlabel('Price (in dollars)')
    plt.ylabel('Proportion of Cars')
    plt.legend()
    plt.show()
    plt.close()
def PollyPlot(xtrain, xtest, y_train, y_test, lr, poly_transform):
    """Plot training data, test data, and the model's predicted function.

    lr: trained linear regression object
    poly_transform: fitted polynomial transformation object
    """
    width = 12
    height = 10
    plt.figure(figsize=(width, height))
    xmax = max([xtrain.values.max(), xtest.values.max()])
    xmin = min([xtrain.values.min(), xtest.values.min()])
    x = np.arange(xmin, xmax, 0.1)
    plt.plot(xtrain, y_train, 'ro', label='Training Data')
    plt.plot(xtest, y_test, 'go', label='Test Data')
    plt.plot(x, lr.predict(poly_transform.fit_transform(x.reshape(-1, 1))),
             label='Predicted Function')
    plt.ylim([-10000, 60000])
    plt.ylabel('Price')
    plt.legend()
    plt.show()
#An important step in testing your model is to split your data into training and testing sets.
y_data=df['price']
x_data=df.drop('price',axis=1)
#we randomly split our data into training and testing sets using the train_test_split function
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_data,y_data,test_size=0.15, random_state=1)
print("Number
of test samples:",x_test.shape[0])
print("Number
of Training ",x_train.shape[0])
from sklearn.linear_model import LinearRegression
lre=LinearRegression()
print("1:",lre.fit(x_train[['horsepower']],y_train))
#cal R^2 on the test data
print("2:",lre.score(x_test[['horsepower']],y_test))
print("3:",lre.score(x_train[['horsepower']],y_train))
#cross_val_score
from sklearn.model_selection import cross_val_score
Rcross = cross_val_score(lre, x_data[['horsepower']], y_data, cv=4)
print("Rcross:", Rcross)
print("The mean of the folds is", Rcross.mean(), "and the standard deviation is", Rcross.std())
#scoring='neg_mean_squared_error' returns negated MSE, so multiply by -1 to recover the MSE per fold
print(-1 * cross_val_score(lre, x_data[['horsepower']], y_data, cv=4, scoring='neg_mean_squared_error'))
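# A minimal sketch (an addition, not part of the original notebook): summarize
# the per-fold error as a single RMSE figure, assuming the same lre estimator
# and 4 folds as above.
mse_per_fold = -1 * cross_val_score(lre, x_data[['horsepower']], y_data, cv=4,
                                    scoring='neg_mean_squared_error')
print("Mean cross-validated RMSE:", np.sqrt(mse_per_fold).mean())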
from sklearn.model_selection import cross_val_predict
yhat = cross_val_predict(lre, x_data[['horsepower']], y_data, cv=4)
print("First five out-of-fold predictions:", yhat[0:5])
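# A minimal sketch (an addition, not part of the original notebook): score the
# out-of-fold predictions from cross_val_predict against the true prices.
from sklearn.metrics import r2_score
print("R^2 of out-of-fold predictions:", r2_score(y_data, yhat))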
#create a multiple linear regression object and train the model
lr = LinearRegression()
lr.fit(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']], y_train)
#Prediction using training data:
yhat_train = lr.predict(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
print("yhat_train",yhat_train[0:5])
#Prediction using test data:
yhat_test = lr.predict(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
print("yhat_test",yhat_test[0:5])
Title = 'Distribution Plot of Predicted Value Using Training Data vs Training Data Distribution'
DistributionPlot(y_train, yhat_train, "Actual Values (Train)", "Predicted Values (Train)", Title)
Title = 'Distribution Plot of Predicted Value Using Test Data vs Distribution of Test Data'
DistributionPlot(y_test, yhat_test, "Actual Values (Test)", "Predicted Values (Test)", Title)
#Overfitting
from sklearn.preprocessing import PolynomialFeatures
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.45, random_state=0)
pr = PolynomialFeatures(degree=5)
x_train_pr = pr.fit_transform(x_train[['horsepower']])
x_test_pr = pr.transform(x_test[['horsepower']])  # transform only; pr is already fitted on the training data
print(pr)
#create a linear regression model "poly" and
train it
poly = LinearRegression()
poly.fit(x_train_pr, y_train)
yhat = poly.predict(x_test_pr)
print("Predicted values:", yhat[0:4])
print("True values:", y_test[0:4].values)
PollyPlot(x_train[['horsepower']], x_test[['horsepower']], y_train, y_test, poly,pr)
#R^2 of the training data:
print("R^2 (train):", poly.score(x_train_pr, y_train))
#R^2 of the test data (a large negative value indicates severe overfitting):
print("R^2 (test):", poly.score(x_test_pr, y_test))
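# A minimal sketch (an addition, following the usual overfitting check): track
# test R^2 as the polynomial order grows; the score typically peaks at a low
# order and then falls as the model starts fitting noise.
Rsqu_test = []
order = [1, 2, 3, 4]
for n in order:
    pr_n = PolynomialFeatures(degree=n)
    x_train_pr_n = pr_n.fit_transform(x_train[['horsepower']])
    x_test_pr_n = pr_n.transform(x_test[['horsepower']])
    lr_n = LinearRegression()
    lr_n.fit(x_train_pr_n, y_train)
    Rsqu_test.append(lr_n.score(x_test_pr_n, y_test))
plt.plot(order, Rsqu_test)
plt.xlabel('order')
plt.ylabel('R^2')
plt.title('R^2 Using Test Data')
plt.show()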
#Ridge regression
pr=PolynomialFeatures(degree=2)
x_train_pr=pr.fit_transform(x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','normalized-losses','symboling']])
x_test_pr=pr.transform(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg','normalized-losses','symboling']])  # transform only; pr is already fitted
from sklearn.linear_model import Ridge
#create a Ridge regression object, setting the regularization parameter (alpha) to 0.1
RidgeModel = Ridge(alpha=0.1)
RidgeModel.fit(x_train_pr, y_train)
print("R^2 (test):", RidgeModel.score(x_test_pr, y_test))
yhat = RidgeModel.predict(x_test_pr)
print('predicted:', yhat[0:4])
print('test set :', y_test[0:4].values)
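# A minimal sketch (an addition, following the usual model-selection pattern):
# sweep alpha and record train/test R^2 to see how regularization trades
# training fit for generalization.
Rsqu_test = []
Rsqu_train = []
alphas = 10 * np.arange(1, 100)
for alpha in alphas:
    model = Ridge(alpha=alpha)
    model.fit(x_train_pr, y_train)
    Rsqu_train.append(model.score(x_train_pr, y_train))
    Rsqu_test.append(model.score(x_test_pr, y_test))
plt.figure(figsize=(12, 10))
plt.plot(alphas, Rsqu_test, label='validation data')
plt.plot(alphas, Rsqu_train, 'r', label='training data')
plt.xlabel('alpha')
plt.ylabel('R^2')
plt.legend()
plt.show()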
#Grid Search
from sklearn.model_selection import GridSearchCV
# 'normalize' was removed from Ridge in scikit-learn 1.2, so only alpha is searched here
parameters1 = [{'alpha': [0.001, 0.1, 1, 10, 100, 1000, 10000, 100000]}]
print(parameters1)
RR = Ridge()
#Create a ridge grid search object
Grid1=GridSearchCV(RR,parameters1,cv=4)
#Fit the model
Grid1.fit(x_data[['horsepower','curb-weight','engine-size','highway-mpg']],y_data)
BestRR=Grid1.best_estimator_
print(BestRR)
# test our model on the test data
print(BestRR.score(x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']], y_test))
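# A minimal sketch (an addition, not part of the original notebook): inspect
# which alpha the grid search selected and the mean cross-validated score for
# each candidate.
print("Best parameters:", Grid1.best_params_)
print("Mean CV scores:", Grid1.cv_results_['mean_test_score'])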