%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt


# Seed NumPy's global RNG so the synthetic data set below is reproducible.
np.random.seed(42)


def f(x):
    """Return noisy samples of the line 7 + 3 * x.

    One standard-normal draw, scaled by 10, is added per element of ``x``,
    so every call consumes ``len(x)`` values from NumPy's global RNG.
    """
    noise = 10 * np.random.randn(len(x))
    return 7 + 3 * x + noise


# 50000 inputs drawn uniformly from [0, 100), and their noisy targets.
x = 100 * np.random.random(50000)
y = f(x)
m = len(y)  # number of samples
batch = 100  # batch size


def h(theta_0, theta_1, x):
    """Linear hypothesis: intercept ``theta_0`` plus slope ``theta_1`` times ``x``."""
    return theta_1 * x + theta_0


def cost(theta_0, theta_1, x_i, y_i):
    """Half the squared error of the hypothesis ``h`` at ``(x_i, y_i)``."""
    residual = h(theta_0, theta_1, x_i) - y_i
    return 0.5 * residual ** 2


# Stochastic (mini-batch) gradient descent on the squared-error cost, using
# plain NumPy. Reads the data (x, y, m) and the helpers h / cost defined
# earlier in the file.
theta_old = np.array([0., 0.])
theta_new = np.array([1., 1.])  # The algorithm starts at [1,1]
lr = 0.00005  # learning rate
batch = 1  # batch size: number of random samples used per update
report_every = 10000  # updates averaged into each point of cost_list

iter_num = 0
sum_cost = 0.0
cost_list = []

for i in range(m * 20):
    iter_num += 1
    # random choice of matching x and y (sampled with replacement)
    idx = np.random.randint(0, m, size=batch)
    x_i = x[idx]  # random choice of x
    y_i = y[idx]  # random choice of y

    theta_old = theta_new
    pred_y = h(theta_old[0], theta_old[1], x_i)
    residual = pred_y - y_i

    # Average the per-sample gradient over the mini-batch. x_i and y_i are
    # arrays of length `batch`, so each component must be reduced to a scalar
    # explicitly: storing the raw array into a scalar slot only works through
    # size-1 array-to-scalar conversion and fails outright for batch > 1.
    # With batch == 1 the mean is the single sample's gradient.
    s_k = -np.array([residual.mean(), (residual * x_i).mean()])
    theta_new = theta_old + lr * s_k

    # saved for plot: accumulate the mini-batch's mean cost as a plain float
    # so cost_list ends up holding floats rather than shape-(1,) arrays
    sum_cost += float(np.mean(cost(theta_old[0], theta_old[1], x_i, y_i)))
    if (i + 1) % report_every == 0:
        cost_list.append(sum_cost / report_every)
        sum_cost = 0.0

print("Local minimum occurs where:")
print("theta_0 =", theta_new[0])
print("theta_1 =", theta_new[1])

Local minimum occurs where:
theta_0 = 7.07789735537464
theta_1 = 3.0368460823859147


# One x-position per recorded average, spaced 10000 iterations apart
# starting at 0, plotted against the averaged cost values in cost_list.
iterations = np.arange(0, len(cost_list) * 10000, 10000)
plt.plot(iterations, cost_list)
plt.ylabel("avg cost")
plt.xlabel("iterations")
plt.show()


import torch

# Stochastic gradient descent again, this time letting PyTorch autograd
# compute the gradient. Reads x, y, m and batch defined earlier in the file.
# NOTE: the mini-batch indices below are drawn with np.random, so this torch
# seed does not influence which samples are picked.
torch.manual_seed(42)

lr = 0.00005  # learning rate or step size
report_every = 10000  # updates averaged into each point of cost_list

# let's get our data points x and y into the expected format
x_tensor = torch.from_numpy(x).float()
y_tensor = torch.from_numpy(y).float()

# theta_0, theta_1
theta = torch.tensor([1., 1.], requires_grad=True, dtype=torch.float)

iter_num = 0
# Initialise the running cost here instead of relying on whatever value an
# earlier part of the file left in sum_cost.
sum_cost = 0.0
cost_list = []

# stopping condition based on iteration count
for i in range(m * 5):
    iter_num += 1
    # random choice of matching x and y
    idx = np.random.randint(0, m, size=batch)
    x_i = x_tensor[idx]
    y_i = y_tensor[idx]

    yhat = theta[0] + theta[1] * x_i
    error = y_i - yhat
    # Operator form of the half squared error, averaged over the mini-batch;
    # it does not depend on torch.mul accepting a Python scalar first argument.
    loss = (0.5 * error ** 2).mean()
    sum_cost += loss.item()

    loss.backward()  # autograd fills theta.grad with d(loss)/d(theta)

    # Apply the update outside of autograd tracking, then clear the
    # accumulated gradient so the next backward() starts from zero.
    with torch.no_grad():
        theta -= lr * theta.grad
    theta.grad.zero_()

    # saved for plot
    if (i + 1) % report_every == 0:
        cost_list.append(sum_cost / report_every)
        sum_cost = 0.0

print("Local minimum occurs where:")
print("theta_0 =", theta[0].item())
print("theta_1 =", theta[1].item())

Local minimum occurs where:
theta_0 = 6.781114101409912
theta_1 = 2.968665599822998


# One x-position per recorded average, spaced 10000 iterations apart
# starting at 0, plotted against the averaged cost values in cost_list.
iterations = np.arange(0, len(cost_list) * 10000, 10000)
plt.plot(iterations, cost_list)
plt.ylabel("avg cost")
plt.xlabel("iterations")
plt.show()

| Algorithm | Time per iteration | Total time to convergence for large data (in theory) | Total time to convergence for large data (in practice) | Sensitivity to parameters |
| --- | --- | --- | --- | --- |
| Gradient Descent | Slow for large data | Slower | Usually slower | Moderate |
| Stochastic Gradient Descent | Always fast | Faster | Usually faster | Very high |

Stochastic Gradient Descent