Using PRNG key

Contents

import jax.numpy as jnp
from jax import grad, jit, vmap
from jax import random
import jax
import numpy as np
import optax

jax.devices()

WARNING:absl:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)

[CpuDevice(id=0)]

# y_hat = theta[0] + x^T theta[1:] for a single point (theta[0] is the intercept)

def pred_yhat(x, theta):
    """Predict the response for a single sample with an affine model.

    Args:
        x: Feature vector of one point.
        theta: Parameter vector; ``theta[0]`` is the intercept and
            ``theta[1:]`` are the per-feature weights.

    Returns:
        ``dot(x, theta[1:]) + theta[0]``.
    """
    intercept = theta[0]
    weights = theta[1:]
    weighted_sum = jnp.dot(x, weights)
    return weighted_sum + intercept

x = jnp.array([2., 2.])
theta = jnp.array([-1., 2., 3.])

pred_yhat(x, theta)

DeviceArray(9., dtype=float32)

Using PRNG key#

key = random.PRNGKey(0)
X = random.normal(key, (100, 2))
print(X.shape)

(100, 2)

VMAP for auto-batching!#

%timeit vmap(pred_yhat, in_axes=(0, None))(X, theta).block_until_ready()

1.31 ms ± 11.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

%timeit jnp.stack([pred_yhat(x, theta) for x in X]).block_until_ready()

98 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

%timeit X@theta[1:] + theta[0]

893 µs ± 2.61 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

JIT for speedup#

%timeit vmap(jit(pred_yhat), in_axes=(0, None))(X, theta).block_until_ready()

289 µs ± 2.31 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

def pred_y_hat_vector(X, theta):
    """Predict y_hat for every row of ``X`` using one shared ``theta``.

    ``vmap`` maps ``pred_yhat`` over axis 0 of ``X`` (``in_axes=(0, None)``),
    while ``theta`` is passed unbatched to every per-row call.

    Args:
        X: Batch of samples, one per entry along the leading axis.
        theta: Parameter vector forwarded unchanged to ``pred_yhat``.

    Returns:
        The per-row predictions stacked along the leading axis.
    """
    # A named ``def`` (rather than a lambda bound to a name) gives the
    # function a real name in tracebacks and a place for this docstring.
    return vmap(jit(pred_yhat), in_axes=(0, None))(X, theta)

def cost(X, y, theta):
    """Return the 2-norm of the residual between targets and predictions.

    The value is the norm itself, not its square.

    Args:
        X: Batch of samples passed to ``pred_y_hat_vector``.
        y: Target values compared against the predictions.
        theta: Parameter vector passed to ``pred_y_hat_vector``.

    Returns:
        ``jnp.linalg.norm(y - y_hat, 2)``.

    NOTE(review): ``ord=2`` is the Euclidean norm only when the residual is
    1-D; for a 2-D residual ``jnp.linalg.norm`` returns the largest singular
    value instead, so ``y`` is presumably meant to have the same 1-D shape as
    the predictions -- confirm at the call sites.
    """
    residual = y - pred_y_hat_vector(X, theta)
    return jnp.linalg.norm(residual, 2)

theta_gt = jnp.array([1., 4., 5.])

# Ground-truth targets: predictions under theta_gt plus Gaussian noise (std 0.2).
# Two fixes relative to drawing `random.normal(key, (100, 1))`:
#   * the noise is 1-D, one value per row of X; a (100, 1) array would
#     broadcast against the (100,) prediction vector into a (100, 100) matrix;
#   * the noise uses a key derived from `key`, not `key` itself, which was
#     already consumed to generate X.
# NOTE(review): outputs recorded with the old (100, 1) noise no longer apply --
# re-run the cells that depend on y_gt.
noise_key = random.fold_in(key, 1)
y_gt = vmap(jit(pred_yhat), in_axes=(0, None))(X, theta_gt) + 0.2*random.normal(noise_key, (X.shape[0],))

cost(X, y_gt, theta_gt)

DeviceArray(18.885782, dtype=float32)

Our initial estimates (theta) are not good

cost(X, y_gt, theta)

DeviceArray(377.96906, dtype=float32)

# Gradient of `cost` with respect to its third argument (theta). Because
# `argnums` is given as a list, the returned function yields a 1-tuple.
grad_theta = grad(cost, argnums=[2])

# Plain gradient descent: 50 steps with a fixed step size.
lr = 0.001
for i in range(50):
    # Report the cost at the current theta before taking the step.
    cost_val = cost(X, y_gt, theta)
    print(i, cost_val)
    # Unpack the single gradient out of the 1-tuple.
    (grad_theta_val,) = grad_theta(X, y_gt, theta)
    theta -= lr * grad_theta_val

theta

DeviceArray([1.0210268, 3.881022 , 4.9754868], dtype=float32)

theta_gt

DeviceArray([1., 4., 5.], dtype=float32)

Using Optax instead of manually writing SGD#

#optimizer = optax.adam(learning_rate=0.01)
optimizer = optax.sgd(learning_rate=0.001)

theta = jnp.array([-1., 2., 3.])
opt_state = optimizer.init(theta)

opt_state

(EmptyState(), EmptyState())

# 50 optimisation steps, with Optax turning the gradient into an update.
for i in range(50):
    # Report the cost at the current theta before taking the step.
    cost_val = cost(X, y_gt, theta)
    print(i, cost_val)
    # grad_theta returns a 1-tuple (argnums was given as a list); take its entry.
    grad_theta_val = grad_theta(X, y_gt, theta)[0]
    # Pass the current parameters as well: plain SGD ignores them, but
    # optimizers whose update depends on the parameters (e.g. weight decay)
    # need them, so the loop keeps working if the optimizer is swapped.
    updates, opt_state = optimizer.update(grad_theta_val, opt_state, theta)
    theta = optax.apply_updates(theta, updates)

Is JAX quicker (even on CPU?!)#

Gaussian processes need Cholesky decompositions. Can we get a speedup by using JAX instead of NumPy?

a = np.random.randn(1000, 1000)
b = a.T@a

%timeit np.linalg.cholesky(b)

6.82 ms ± 464 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

b = jnp.array(b)

%timeit jnp.linalg.cholesky(b)

2.19 ms ± 67.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)