""" CSE 546: Recitation 3 October 15, 2013: Logistic Regression """ from __future__ import division # enables decimal division by default import numpy as np import matplotlib.pyplot as plt import sklearn.linear_model as skl # from package scikit-learn, you may need to get from mpl_toolkits.mplot3d import Axes3D """ the sigmoid """ # for plotting 2D sigmoid def plot(x,y): plt.axvline() plt.plot(x,y,color='r',lw=2) # for plotting 3D sigmoid def plot3d(x,y,z): fig = plt.figure() ax = Axes3D(fig) ax.plot_surface(x,y,z,rstride=1,cstride=1) return ax def sigmoid(x): # s(x) = 1 / (1 + e^x) return 1.0/(1 + np.exp(x)) a = -5 # (a,b) the interval to show b = 6 ### visualize 2D sigmoid x = np.arange(a,b,.1) # standard sigmoid plt.figure() w0 = 0.0 w1 = 1.0 y = sigmoid(w0 + w1*x) plt.subplot(131) plot(x,y) # increase magnitude of weight w1 = 3.0 y = sigmoid(w0 + w1*x) plt.subplot(132) plot(x,y) w1 = 9.0 y = sigmoid(w0 + w1*x) plt.subplot(133) plot(x,y) ### visualize 3D sigmoid x = np.arange(a,b) x1,x2 = np.meshgrid(x,x) w0 = 2.0 w1 = 1.0 w2 = -1.0 z = sigmoid(w0 + w1*x1 + w2*x2) ax = plot3d(x1,x2,z) # examine the decision boundary: logistic regression is a linear classifier! y = -w0/w2 - w1/w2 * x for h in [0,.25,.5,.75,1.0]: ax.plot(x,y,h,color='r',lw=2) plt.show() """ overfitting in logistic regression """ n = 10 ### showing why regularization is necessary for linearly separable data mu1 = [0.0,0.0] mu2 = [5.0,5.0] sigma = 1.0 x = np.random.randn(n,2) pos = x*sigma + mu1 # +ve examples x = np.random.randn(n,2) neg = x*sigma + mu2 # -ve examples plt.scatter(pos[:,0], pos[:,1],s=100,color='b') plt.scatter(neg[:,0], neg[:,1],s=100,color='r') plt.show() # use very small lambda for little regularization plt.figure() lmb = 1.0e-7 model1 = skl.LogisticRegression(C=1.0/lmb) x = np.vstack((pos,neg)) y = np.repeat(np.array([1, 0]), n) model1.fit(x,y) w = model1.coef_.ravel() w0 = model1.intercept_; plt.scatter(pos[:,0], pos[:,1],s=100,color='b') plt.scatter(neg[:,0], neg[:,1],s=100,color='r') a = np.arange(-2,7) b = -w0/w[1] - w[0]/w[1] * a plt.plot(a,b,color='g') a1,a2 = np.meshgrid(a,a) z = sigmoid(w0 + w[0]*a1 + w[1]*a2) plot3d(a1,a2,z) plt.show() # use very high lambda for high regularization plt.figure() lmb = 1.0e-1 model1 = skl.LogisticRegression(C=1.0/lmb) x = np.vstack((pos,neg)) y = np.repeat(np.array([1, 0]), n) model1.fit(x,y) w = model1.coef_.ravel() w0 = model1.intercept_; plt.scatter(pos[:,0], pos[:,1],s=100,color='b') plt.scatter(neg[:,0], neg[:,1],s=100,color='r') a = np.arange(-2,7) b = -w0/w[1] - w[0]/w[1] * a plt.plot(a,b,color='g') a1,a2 = np.meshgrid(a,a) z = sigmoid(w0 + w[0]*a1 + w[1]*a2) plot3d(a1,a2,z) plt.show() ### non-separable data ==> less overfitting even without regularization plt.figure() mu1 = [0.0,0.0] mu2 = [3.0,3.0] sigma = 2.3 x = np.random.randn(n,2) pos = x*sigma + mu1 # +ve examples x = np.random.randn(n,2) neg = x*sigma + mu2 # -ve examples # use very small lambda for little regularization lmb = 1.0e-7 model1 = skl.LogisticRegression(C=1.0/lmb) x = np.vstack((pos,neg)) y = np.repeat(np.array([1, 0]), n) model1.fit(x,y) w = model1.coef_.ravel() w0 = model1.intercept_; plt.scatter(pos[:,0], pos[:,1],s=100,color='b') plt.scatter(neg[:,0], neg[:,1],s=100,color='r') a = np.arange(-2,7) b = -w0/w[1] - w[0]/w[1] * a plt.plot(a,b,color='g') a1,a2 = np.meshgrid(a,a) z = sigmoid(w0 + w[0]*a1 + w[1]*a2) plot3d(a1,a2,z) plt.show() """ gradient descent """ ### generate data plt.figure() n = 100 w0 = 4.5 w1 = 2.5 x = np.arange(0,n) np.random.shuffle(x) noise = np.random.randn(n) * 
2.0 y = w0 + w1*x + noise plt.scatter(x,y) plt.show() # ordinary least squares with one feature by exact formula def ls_exact(x,y): w1h = sum(x*y) - sum(x)*np.mean(y) w1h /= sum(x**2) - sum(x)**2/len(x) w0h = np.mean(y) - w1h*np.mean(x) yh = w0h + w1h*x mse = np.mean((yh-y)**2) return (w0h, w1h, mse, yh) # ordinary least squares with one feature using batch gradient descent # stopping criterion is when MSE decreases by delta or less # step size is eta def ls_with_gd(x,y,eta,delta): w0h = 0.0 w1h = 0.0 yh = w0h + w1h*x mse = np.mean((yh-y)**2) converged = False t = 0 while not converged: w0h -= eta * 2 * np.mean(yh - y) w1h -= eta * 2 * np.mean(x*(yh - y)) yh = w0h + w1h*x msep = np.mean((yh-y)**2) chg = mse - msep converged = chg <= delta mse = msep t += 1 if t == 300: break if t % 1000 == 0: print "iter", t, "; MSE decreased", chg, '; MSE =', mse, \ '; w0h =', w0h, '; w1h =', w1h return (w0h, w1h, mse, yh, t) # ordinary least squares with one feature using stochastic gradient descent def ls_with_sgd(x,y,eta,delta): w0h = 0.0 w1h = 0.0 yh = w0h + w1h*x mse = np.mean((yh-y)**2) converged = False t = 0 while not converged: for i in xrange(0,n): yhi = w0h + w1h*x[i] w0h -= eta * 2 * (yhi - y[i]) w1h -= eta * 2 * x[i]*(yhi - y[i]) yh = w0h + w1h*x msep = np.mean((yh-y)**2) chg = mse - msep converged = chg <= delta mse = msep t += 1 if t % 100 == 0: print "iter", t, "; MSE decreased", chg, '; MSE =', mse, \ '; w0h =', w0h, '; w1h =', w1h return (w0h, w1h, mse, yh, t) delta = 1e-9 eta = 1e-4 w0h, w1h, mse, yh = ls_exact(x,y) w0h_gd, w1h_gd, mse_gd, yh_gd, t_gd = ls_with_gd(x,y,eta,delta) w0h_sgd, w1h_sgd, mse_sgd, yh_sgd, t_sgd = ls_with_sgd(x,y,eta,delta) print w0,w1 print w0h,w1h print w0h_gd,w1h_gd print w0h_sgd,w1h_sgd
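
# sanity check (illustrative sketch): np.polyfit solves the same one-feature
# least-squares fit; it returns coefficients highest degree first, i.e. (slope,
# intercept), so these values should closely match w0h, w1h from ls_exact above
w1_np, w0_np = np.polyfit(x, y, 1)
print w0_np, w1_np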
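
""" gradient descent for logistic regression (sketch) """

# The batch-gradient recipe above can be applied to logistic regression itself.
# This is a minimal illustrative sketch, not a reference implementation: it
# minimizes the negative log-likelihood
#   -sum_i [ y_i log s(w.x_i) + (1 - y_i) log(1 - s(w.x_i)) ]
# whose gradient is X^T (s(Xw) - y). The helper name logreg_gd, the fixed step
# size eta, and the fixed iteration count are arbitrary choices for this demo.
def logreg_gd(X, y, eta=0.1, iters=1000):
    Xb = np.hstack((np.ones((X.shape[0], 1)), X))  # prepend a constant feature for the intercept
    w = np.zeros(Xb.shape[1])
    for _ in xrange(iters):
        p = sigmoid(Xb.dot(w))               # predicted P(y = 1 | x)
        w -= eta * Xb.T.dot(p - y) / len(y)  # step along the average gradient
    return w

# example on fresh 2D toy data in the style of the overfitting section:
# class 1 centered at the origin, class 0 centered at (3, 3)
Xd = np.vstack((np.random.randn(10, 2), np.random.randn(10, 2) + 3.0))
yd = np.repeat(np.array([1, 0]), 10)
wd = logreg_gd(Xd, yd)
print wd  # (w0, w1, w2); the decision boundary is w0 + w1*x1 + w2*x2 = 0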