Implementation of AdaBoost by Boosting a Linear-Regression-Based Classifier¶
The data used for this implementation is available at the GitHub Link
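For reference, AdaBoost maintains a distribution $w_t$ over the $N$ training points. At each round $t$ it fits a weak learner $f_t$ to a weighted bootstrap sample, measures its weighted error, and reweights the data:

$$\epsilon_t = \sum_{i=1}^{N} w_t(i)\,\mathbb{1}\{f_t(x_i) \neq y_i\}, \qquad \alpha_t = \frac{1}{2}\ln\frac{1-\epsilon_t}{\epsilon_t},$$

$$w_{t+1}(i) \;\propto\; w_t(i)\,e^{-\alpha_t\, y_i f_t(x_i)}.$$

The boosted classifier after $t$ rounds is $f_{\mathrm{boost}}(x) = \mathrm{sign}\big(\sum_{s=1}^{t} \alpha_s f_s(x)\big)$. Here the weak learner is a least-squares linear regression whose real-valued output is thresholded through $\mathrm{sign}(\cdot)$.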
In [2]:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
%matplotlib inline
plt.rcParams["figure.figsize"] = (10,8)
In [18]:
class LeastSquaresRegression(object):
    """Weak learner: least-squares linear regression thresholded through sign()."""
    def __init__(self):
        self.weights = []

    def calcLeastSquares(self, X, y):
        # Solve the normal equations: w = (X^T X)^{-1} X^T y.
        xtransx = np.dot(X.T, X)
        if xtransx.shape[0] != xtransx.shape[1]:
            raise ValueError('Needs to be a square matrix for inverse')
        matinv = np.linalg.inv(xtransx)
        xtransy = np.dot(X.T, y)
        self.weights = np.dot(matinv, xtransy)

    def makePredictions(self, X):
        # Classify by the sign of the real-valued regression output.
        class_output = np.dot(X, self.weights)
        return np.sign(class_output)
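A quick sanity check of the weak learner on a tiny hand-made dataset (the toy points below are illustrative, not part of the assignment data):

In [ ]:
toy_X = np.array([[1.0, -2.0], [1.0, -1.0], [1.0, 1.0], [1.0, 2.0]])  # bias column included
toy_y = np.array([-1.0, -1.0, 1.0, 1.0])
toy_ls = LeastSquaresRegression()
toy_ls.calcLeastSquares(toy_X, toy_y)
print(toy_ls.makePredictions(toy_X))  # expected: [-1. -1.  1.  1.]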
In [19]:
class Boosting(object):
    def __init__(self, X_train, y_train, T=1500):
        self.X_train = X_train
        self.N = self.X_train.shape[0]
        self.y_train = y_train
        # Start from a uniform distribution over the N training points.
        self.weights = np.ones(self.N) / self.N
        self.epsilont = []
        self.alphas = []
        self.classifiers = []
        self.histogram = {}
        self.num_estimators = T

    def doBoosting(self):
        for t in range(self.num_estimators):
            # Draw a bootstrap sample of size N according to the current weights.
            sample = np.random.choice(self.N, self.N, p=self.weights)
            for idx in sample:
                self.histogram[idx] = self.histogram.get(idx, 0) + 1
            B_Xtrain = self.X_train[sample]
            B_ytrain = self.y_train[sample]
            ls = LeastSquaresRegression()
            ls.calcLeastSquares(B_Xtrain, B_ytrain)
            Y_pred = ls.makePredictions(self.X_train)
            # Weighted error of this round's weak learner on the full training set.
            e_t = np.sum((Y_pred != self.y_train) * self.weights)
            if e_t > 0.5:
                # Negating the weights negates every prediction, so the flipped
                # classifier has weighted error 1 - e_t < 0.5.
                ls.weights = -ls.weights
                Y_pred = ls.makePredictions(self.X_train)
                e_t = np.sum((Y_pred != self.y_train) * self.weights)
            self.epsilont.append(e_t)
            alpha_t = 0.5 * np.log((1 - e_t) / e_t)
            self.alphas.append(alpha_t)
            self.classifiers.append(ls)
            # Upweight misclassified points, downweight correct ones, renormalize.
            self.weights = self.weights * np.exp(-alpha_t * Y_pred * self.y_train)
            self.weights = self.weights / np.sum(self.weights)
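A note on the sign flip above: if a weak learner has weighted error $\epsilon_t > \tfrac{1}{2}$, its negation $-f_t$ misclassifies exactly the complementary set of points, so its weighted error is $1 - \epsilon_t < \tfrac{1}{2}$. This keeps every $\epsilon_t \le \tfrac{1}{2}$ and hence every $\alpha_t \ge 0$.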
In [20]:
X_train = np.genfromtxt('boosting/X_train.csv', delimiter=',')
y_train = np.genfromtxt('boosting/y_train.csv')
X_test = np.genfromtxt('boosting/X_test.csv', delimiter=',')
y_test = np.genfromtxt('boosting/y_test.csv')

def append_column_one(data):
    # Prepend a column of ones so the first weight acts as a bias term.
    append_ones = np.ones((data.shape[0], 1))
    return np.hstack((append_ones, data))

X_train = append_column_one(X_train)
X_test = append_column_one(X_test)
In [21]:
boost = Boosting(X_train, y_train)
boost.doBoosting()

training_error = []
testing_error = []
# Keep running vote totals so evaluating fboost(t) for every t costs one
# pass over the 1500 classifiers instead of a quadratic recomputation.
sum_train = np.zeros(X_train.shape[0])
sum_test = np.zeros(X_test.shape[0])
for alpha, classifier in tqdm(list(zip(boost.alphas, boost.classifiers))):
    sum_train += alpha * classifier.makePredictions(X_train)
    sum_test += alpha * classifier.makePredictions(X_test)
    fboost_train_pred = np.sign(sum_train)
    fboost_test_pred = np.sign(sum_test)
    training_error.append(np.mean(fboost_train_pred != y_train))
    testing_error.append(np.mean(fboost_test_pred != y_test))
(a) Plotting the training vs. testing error for t = 1, 2, ..., 1500¶
In [22]:
plt.figure()
plt.plot(range(1, 1501), training_error, label="Training error")
plt.plot(range(1, 1501), testing_error, label="Testing error")
plt.title("Training and testing error of fboost(t) for t = 1, 2, ..., 1500")
plt.xlabel("t")
plt.ylabel("Error rate")
plt.legend()
plt.show()
(b) Upper bound on the training error as a function of t¶
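For weak learners with weighted error $\epsilon_s \le \tfrac{1}{2}$, AdaBoost's training error after $t$ rounds satisfies the standard bound

$$\frac{1}{N}\sum_{i=1}^{N}\mathbb{1}\{f_{\mathrm{boost}}(x_i)\neq y_i\} \;\le\; \exp\!\Big(-2\sum_{s=1}^{t}\big(\tfrac{1}{2}-\epsilon_s\big)^{2}\Big),$$

which the cell below evaluates from the stored $\epsilon_s$ values.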
In [23]:
# Cumulative sum over (1/2 - epsilon_s)^2 gives the bound for every t in one pass.
gaps = np.power(0.5 - np.array(boost.epsilont), 2)
training_upper_bound = np.exp(-2 * np.cumsum(gaps))
In [24]:
plt.figure()
plt.plot(range(1, 1501), training_upper_bound, label="Upper bound")
plt.title("Upper bound on the training error as a function of t")
plt.xlabel("t")
plt.legend()
plt.show()
(c) Histogram of how often each training point was sampled¶
The histogram below counts, over all 1500 rounds, how many times each training example was drawn by the weighted bootstrap sampler.
In [25]:
plt.figure()
# plt.stem takes no 'color' kwarg; set the color via linefmt/markerfmt instead.
plt.stem(list(boost.histogram.keys()), list(boost.histogram.values()), linefmt='g-', markerfmt='go')
plt.xlabel("Training data point index")
plt.ylabel("Number of times sampled")
plt.show()
(d) Epsilon and alpha as functions of t¶
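Since $\alpha_t = \tfrac{1}{2}\ln\frac{1-\epsilon_t}{\epsilon_t}$, the two plots mirror each other: $\alpha_t$ shrinks toward $0$ as $\epsilon_t$ approaches $\tfrac{1}{2}$, so rounds whose weak learner is barely better than chance get little say in the final vote.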
In [27]:
## Epsilon
plt.figure()
plt.plot(range(1, 1501), boost.epsilont)
plt.title("Epsilon as a function of t")
plt.xlabel("t")
plt.show()
In [28]:
## Alpha
plt.figure()
plt.plot(range(1, 1501), boost.alphas)
plt.title("Alpha as a function of t")
plt.xlabel("t")
plt.show()