Implementation of AdaBoost by Boosting a Linear-Regression-Based Classifier

The data used for this implementation is available at the Github Link.
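For reference, each boosting round in the code below follows the standard AdaBoost recipe: a weak learner h_t is fit on a bootstrap sample drawn according to the current example weights; its weighted error and vote are epsilon_t = sum_i w_i * 1[h_t(x_i) != y_i] and alpha_t = 0.5 * ln((1 - epsilon_t) / epsilon_t); and the example weights are then updated as w_i <- w_i * exp(-alpha_t * y_i * h_t(x_i)) and renormalized. The weak learner here is an ordinary least-squares fit whose real-valued output is thresholded with sign().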

In [2]:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm 
%matplotlib inline
plt.rcParams["figure.figsize"] = (10,8)
In [18]:
class LeastSquaresRegression(object):
    """Least-squares linear classifier: fit w = (X^T X)^{-1} X^T y, predict sign(Xw)."""

    def __init__(self):
        self.weights = []

    def calcLeastSquares(self, X, y):
        # Closed-form ordinary least-squares solution.
        xtranspose = np.transpose(X)
        xtransx = np.dot(xtranspose, X)
        if xtransx.shape[0] != xtransx.shape[1]:
            raise ValueError('Needs to be a square matrix for inverse')
        matinv = np.linalg.inv(xtransx)
        xtransy = np.dot(xtranspose, y)
        self.weights = np.dot(matinv, xtransy)

    def makePredictions(self, X):
        # Threshold the real-valued regression output to a +/-1 label.
        class_output = np.dot(X, self.weights)
        return np.sign(class_output)
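As a quick sanity check of the base learner on its own, here is a minimal sketch on a tiny hand-made data set (the toy arrays below are purely illustrative and not part of the boosting data):

# Illustrative only: fit the least-squares classifier on a 1-D toy problem.
toy_X = np.array([[1., 0.], [1., 1.], [1., 2.], [1., 3.]])  # first column is the bias term
toy_y = np.array([-1., -1., 1., 1.])

base = LeastSquaresRegression()
base.calcLeastSquares(toy_X, toy_y)
print(base.makePredictions(toy_X))  # expected output: [-1. -1.  1.  1.]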
In [19]:
class Boosting(object):

    def __init__(self, X_train, y_train, num_estimators=1500):
        self.X_train = X_train
        self.N = self.X_train.shape[0]
        self.y_train = y_train
        self.weights = np.ones(self.N) / self.N   # uniform example weights to start
        self.epsilont = []                        # weighted error of each round
        self.alphas = []                          # vote of each weak learner
        self.classifiers = []                     # fitted weak learners
        self.histogram = {}                       # how often each example index was sampled
        self.num_estimators = num_estimators

    def doBoosting(self):
        for t in range(self.num_estimators):
            # Draw a bootstrap sample of the training set according to the current weights.
            sampled = np.random.choice(self.N, self.N, p=self.weights)

            for idx in sampled:
                self.histogram[idx] = self.histogram.get(idx, 0) + 1

            B_Xtrain = self.X_train[sampled]
            B_ytrain = self.y_train[sampled]

            # Fit the least-squares weak learner on the sampled data.
            ls = LeastSquaresRegression()
            ls.calcLeastSquares(B_Xtrain, B_ytrain)
            Y_pred = ls.makePredictions(self.X_train)

            # Weighted training error on the full training set.
            e_t = np.sum((Y_pred != self.y_train) * self.weights)

            # If the weak learner is worse than chance, flip its sign so the
            # weighted error drops below 0.5 and alpha_t stays non-negative.
            if e_t > 0.5:
                ls.weights = -ls.weights
                Y_pred = ls.makePredictions(self.X_train)
                e_t = np.sum((Y_pred != self.y_train) * self.weights)

            self.epsilont.append(e_t)

            alpha_t = 0.5 * np.log((1 - e_t) / e_t)
            self.alphas.append(alpha_t)
            self.classifiers.append(ls)

            # Reweight: misclassified examples get heavier, then renormalize.
            self.weights = self.weights * np.exp(-alpha_t * Y_pred * self.y_train)
            self.weights = self.weights / np.sum(self.weights)
In [20]:
X_train = np.genfromtxt('boosting/X_train.csv', delimiter=',')
y_train = np.genfromtxt('boosting/y_train.csv')
X_test = np.genfromtxt('boosting/X_test.csv', delimiter=',')
y_test = np.genfromtxt('boosting/y_test.csv')


def append_column_one(data):
    # Prepend a column of ones so the bias term is absorbed into the weight vector.
    append_ones = np.ones((data.shape[0], 1))
    data = np.hstack((append_ones, data))
    return data

X_train = append_column_one(X_train)
X_test = append_column_one(X_test)
In [21]:
training_error = []
testing_error = []

boost = Boosting(X_train, y_train)
boost.doBoosting()

for t in tqdm(range(1, 1501)):
    # f_boost(t): sign of the alpha-weighted vote of the first t weak learners.
    sum_train = np.zeros(X_train.shape[0])
    sum_test = np.zeros(X_test.shape[0])
    for i in range(t):
        alpha = boost.alphas[i]
        classifier = boost.classifiers[i]
        sum_train += (alpha * classifier.makePredictions(X_train))
        sum_test += (alpha * classifier.makePredictions(X_test))
    fboost_train_pred = np.sign(sum_train)
    fboost_test_pred = np.sign(sum_test)

    training_error.append(np.sum(fboost_train_pred != y_train) / y_train.shape[0])
    testing_error.append(np.sum(fboost_test_pred != y_test) / y_test.shape[0])
100%|██████████| 1500/1500 [00:50<00:00, 29.75it/s]
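Note that the evaluation loop above re-runs every weak learner for each value of t, which is quadratic in the number of rounds. An equivalent linear-time version keeps a running weighted sum and records the error after each round; a minimal sketch (not part of the original notebook, variable names are illustrative) looks like this:

# Incremental evaluation: one pass over the 1500 classifiers instead of 1500 nested passes.
sum_train = np.zeros(X_train.shape[0])
sum_test = np.zeros(X_test.shape[0])
training_error_fast, testing_error_fast = [], []
for alpha, classifier in zip(boost.alphas, boost.classifiers):
    sum_train += alpha * classifier.makePredictions(X_train)
    sum_test += alpha * classifier.makePredictions(X_test)
    training_error_fast.append(np.mean(np.sign(sum_train) != y_train))
    testing_error_fast.append(np.mean(np.sign(sum_test) != y_test))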

(a) Plotting the training and testing error for t = 1, 2, ..., 1500

In [22]:
plt.figure()
plt.plot(range(1,1501), training_error, label="Training error")
plt.plot(range(1,1501), testing_error, label="Testing error")
plt.title("Training and Testing error of fboost(t) for t = 1, 2, ... 1500")
plt.legend()
plt.show()

(b) Upper bound on the training error as a function of t
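The quantity computed below is the standard AdaBoost training-error bound: after t rounds, the empirical error of fboost(t) is at most exp(-2 * sum_{tau=1..t} (1/2 - epsilon_tau)^2), so as long as each weak learner does slightly better than chance the bound decays exponentially in t.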

In [23]:
training_upper_bound = []
for t in tqdm(range(1, 1501)):
    ub = 0
    for i in range(t):
        epsilon = boost.epsilont[i]
        ub += np.power((0.5 - epsilon), 2)
    training_upper_bound.append(np.exp(-2 * ub))
100%|██████████| 1500/1500 [00:05<00:00, 272.81it/s]
In [24]:
plt.figure()
plt.plot(range(1, 1501), training_upper_bound, label="upper bound line")
plt.title("Upper bound of training error as a function of t")
plt.legend()
plt.show()

(c) Histogram of how often each training example was sampled

In [25]:
plt.figure()
plt.stem(list(boost.histogram.keys()), list(boost.histogram.values()), linefmt='g-', markerfmt='go')
plt.xlabel("Training example index")
plt.ylabel("Number of times sampled")
plt.show()
In [26]:
# Sanity check (optional): total count should equal N * number of boosting rounds.
# np.sum(list(boost.histogram.values()))

(d) Epsilon and Alpha as a function of t

In [27]:
## Epsilon
plt.figure()
plt.plot(range(1, 1501), boost.epsilont)
plt.title("Epsilon as a function of t")
plt.show()
In [28]:
## Alphas
plt.figure()
plt.plot(range(1, 1501), boost.alphas)
plt.title("Alphas as a function of t")
plt.show()



