Deep Kernel Learning#
Author: Zeel B Patel
We will attempt to implement an idea presented in [WHSX15].
import scipy.stats
from scipy.optimize import minimize
import numpy
import matplotlib.pyplot as plt
from matplotlib import rc
from autograd import numpy as np
from autograd import elementwise_grad as egrad
from autograd import grad
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
rc('text', usetex=True)
rc('font', size=16)
Defining activation functions.
def relu(z):
return z.clip(0, np.inf)
def sigmoid(z):
return 1./(1+np.exp(-z))
def tanh(z):
return (np.exp(z)-1)/(1+np.exp(z))
Let us visualize these functions.
z = np.linspace(-10,10,100)
fig, ax = plt.subplots(1,3,figsize=(12,4))
ax[0].plot(relu(z), label='relu')
ax[1].plot(sigmoid(z), label='sigmoid')
ax[2].plot(tanh(z), label='tanh')
for axx in ax:
axx.legend();
axx.set_xlabel('z')
ax[0].set_ylabel('f(z)');
data:image/s3,"s3://crabby-images/f4310/f4310103368faba004552af2bd210b0ba5bdecb6" alt="../../_images/ce0a99abfd9406699d7cf87b66cda2e748d2997dfcd75d85d43446bf5763d168.png"
We will consider a step function dataset for this task.
num_low=25
num_high=25
gap = -.1
noise=0.0001
X = np.vstack((np.linspace(-1, -gap/2.0, num_low)[:, np.newaxis],
np.linspace(gap/2.0, 1, num_high)[:, np.newaxis]))
y = np.vstack((np.zeros((num_low, 1)), np.ones((num_high,1))))
scale = np.sqrt(y.var())
offset = y.mean()
y = (y-offset)/scale
plt.scatter(X, y);
plt.xlabel('X')
plt.ylabel('y');
data:image/s3,"s3://crabby-images/817dc/817dcefb90c5303799e1ad296a9371ab0b9c70a8" alt="../../_images/ec0d9bbd8643d12ab78c9464e2e21b9e896808289ac06508507f0e0dd57751bd.png"
We will initialize the weights and variables to hold intermediate outputs using a function.
def initialize(seed, sol=[1, 3, 2, 1]):
np.random.seed(seed)
size_of_layers = sol
W = [None]*len(size_of_layers)
b = [None]*len(size_of_layers)
# Dummy
W[0] = np.array([0])
b[0] = np.array([0])
for i in range(1, len(size_of_layers)):
W[i] = np.random.rand(size_of_layers[i], size_of_layers[i-1])
b[i] = np.random.rand(size_of_layers[i], 1)
Z = [None]*(len(W))
Z[0] = np.array([0])
A = [X]
A.extend([None]*(len(W)-1))
sigma, sigma_n, l = np.random.rand(3)
activations = ['relu']*(len(size_of_layers)-2)
activations.insert(0, 'empty')
activations.append('tanh')
return W, b, Z, A, sigma, sigma_n, l, activations
activation_func = {'sigmoid':sigmoid, 'relu':relu, 'tanh':tanh}
Let us define the RBF kernel and Negative log likelihood (including a forward pass over neural network).
def RBF(x1, x2, l, sigma):
d = np.square(x1 - x2.T)
d_scaled = d/np.square(l)
return sigma**2 * np.exp(-d_scaled)
def NegLogLikelihood(W, b, l, sigma, sigma_n):
for i in range(1, len(W)):
Z[i] = A[i-1]@(W[i].T) + b[i].T
A[i] = activation_func[activations[i]](Z[i])
X_hat = A[-1]
K = RBF(X_hat, X_hat, l, sigma)
K += np.eye(X.shape[0])*sigma_n**2
nll = 0.5*y.T@np.linalg.inv(K)@y + 0.5*np.log(np.linalg.det(K)) + 0.5*np.log(2*np.pi)
return nll[0,0]
Now, we optimize the kernel paramaters as well as weights of neural networks using autograd
.
grad_func = grad(NegLogLikelihood, argnum=[0,1,2,3,4])
W, b, Z, A, sigma, sigma_n, l, activations = initialize(seed=0)
lr = 0.0001
loss = []
for iters in range(500):
dW, db, dl, dsigma, dsigma_n = grad_func(W, b, l, sigma, sigma_n)
for i in range(len(W)):
W[i] = W[i] - lr*dW[i]
for i in range(len(b)):
b[i] = b[i] - lr*db[i]
l = l - lr*dl
sigma = sigma - lr*dsigma
sigma_n = sigma_n - lr*dsigma_n
loss.append(NegLogLikelihood(W, b, l, sigma, sigma_n))
plt.plot(loss);
plt.xlabel('Iterations')
plt.ylabel('Loss (Neg. Log Likelihood)');
data:image/s3,"s3://crabby-images/b09f2/b09f2a72c72c4412c15facc2f7eb558e5043635d" alt="../../_images/df884bb6b10307af582d2ce586973b24fb39628fc0f28aba6a384675002c9569.png"
Let us define a function to predict the output over new inputs.
def predict(X_new):
# Getting X_hat
A[0] = X
for i in range(1, len(W)):
Z[i] = A[i-1]@(W[i].T) + b[i].T
A[i] = activation_func[activations[i]](Z[i])
X_hat = A[-1]
# Getting X_new_hat
A[0] = X_new
for i in range(1, len(W)):
Z[i] = A[i-1]@(W[i].T) + b[i].T
A[i] = activation_func[activations[i]](Z[i])
X_new_hat = A[-1]
K = RBF(X_hat, X_hat, l, sigma)
K += np.eye(X_hat.shape[0])*sigma_n**2
K_inv = np.linalg.inv(K)
K_star = RBF(X_hat, X_new_hat, l, sigma)
K_star_star = RBF(X_new_hat, X_new_hat, l, sigma)
K_star_star += np.eye(X_new_hat.shape[0])*sigma_n**2 # include likelihood noise
mu = K_star.T@K_inv@y
cov = K_star_star - K_star.T@K_inv@K_star
return mu.squeeze(), cov
Now, we visualize predicted mean and variance at new inputs.
X_new = np.linspace(-2, 2, 100).reshape(-1,1)
mu, cov = predict(X_new)
std2 = np.sqrt(cov.diagonal())*2
plt.scatter(X, y)
plt.plot(X_new, mu, label='predicted mean');
plt.fill_between(X_new.squeeze(), mu-std2, mu+std2, alpha=0.4, label='95\% confidence');
plt.legend(bbox_to_anchor=(1,1));
data:image/s3,"s3://crabby-images/08ef4/08ef485348e4384d59a71710ac9a4d2b472d9118" alt="../../_images/d253b126aee77f024bcf12800ce796182a15cdc6fe932b82ffcc910875539ffa.png"
Let us try one more dataset.
# Generate data
def f(X): # target function
return numpy.sin(5*X) + numpy.sign(X)
X = numpy.sort(numpy.random.uniform(-1, 1, (30, 1))) # data
y = f(X)[:, 0].reshape(-1,1)
plt.scatter(X, y);
data:image/s3,"s3://crabby-images/83294/832944fb25d2ed798b1d1a730c3180f7d9ab7f5b" alt="../../_images/0ee0ade68ca03a761edb5a42ef4c776a8dedc96d53b516eb6393350a0c721a3c.png"
grad_func = grad(NegLogLikelihood, argnum=[0,1,2,3,4])
W, b, Z, A, sigma, sigma_n, l, activations = initialize(seed=0, sol=[1,3,1])
lr = 0.01
loss = []
for iters in range(200):
dW, db, dl, dsigma, dsigma_n = grad_func(W, b, l, sigma, sigma_n)
for i in range(len(W)):
W[i] = W[i] - lr*dW[i]
for i in range(len(b)):
b[i] = b[i] - lr*db[i]
l = l - lr*dl
sigma = sigma - lr*dsigma
sigma_n = sigma_n - lr*dsigma_n
loss.append(NegLogLikelihood(W, b, l, sigma, sigma_n))
plt.plot(loss);
plt.xlabel('Iterations')
plt.ylabel('Loss (Neg. Log Likelihood)');
data:image/s3,"s3://crabby-images/298fe/298feec2efd550a8a3a5950cf6f326d819cea6c7" alt="../../_images/a8fd110d3b9ea7f1e442e20027de19b34632d9de758daeaf44281cb30bc9fa1a.png"
X_new = np.linspace(X.min(), X.max(), 100).reshape(-1,1)
mu, cov = predict(X_new)
std2 = np.sqrt(cov.diagonal())*2
plt.scatter(X, y)
plt.plot(X_new, mu, label='predicted mean');
plt.fill_between(X_new.squeeze(), mu-std2, mu+std2, alpha=0.4, label='95\% confidence');
plt.legend(bbox_to_anchor=(1,1));
data:image/s3,"s3://crabby-images/1de16/1de167ef42ed92668c555e6dd42f90b1bac464dc" alt="../../_images/5c9b9ba3046bbcb943b5e3659284c93fc72a5197f5c658bad055cdf2950a7648.png"