Gradient Descent#

Import the required packages. These are usually all placed at the very top of the program.

import pandas as pd

Following the example from the course material, let's walk through it once in Python.

First, define the functions we need:

  1. Define the model (simple linear regression): its parameters are the intercept and the slope, plus the model's input (feature); the output is the corresponding model prediction.

def model(a, b, x):

    y_hat = a + b * x

    return y_hat
  2. Compute the loss & gradient: calculate the loss value & gradient for the given parameters (i.e., the current model).

def get_loss(data, a, b):

    loss = 0

    for i in range(0, data.shape[0]):
        x, y = data['x'][i], data['y'][i]
        loss += ( y - model(a, b, x))**2

    return loss
def get_gradient(data, a, b):

    gradient = 0

    for i in range(0, data.shape[0]):
        x, y = data['x'][i], data['y'][i]
        gradient += -2 * ( y - model(a, b, x))

    return gradient
  3. Update the parameters: given the gradient and a learning rate (lr), update the current parameter value to a new value (the formulas behind these steps are summarized right after the code below).

def update_param(param, gradient, lr):

    new_param = param - gradient * lr

    return new_param
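
For reference, these helpers implement the squared-error loss, its partial derivatives, and the standard gradient-descent update rule (writing the learning rate lr as $\eta$); the derivative with respect to the slope is used in the two-parameter section further below:

$$
L(a, b) = \sum_i \big(y_i - (a + b x_i)\big)^2,\qquad
\frac{\partial L}{\partial a} = \sum_i -2\big(y_i - (a + b x_i)\big),\qquad
\frac{\partial L}{\partial b} = \sum_i -2\,x_i\big(y_i - (a + b x_i)\big)
$$

$$
\text{param} \leftarrow \text{param} - \eta \cdot \text{gradient}
$$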

Define the same data as in the course material:

data = pd.DataFrame({
    'x': [0.5, 2.3, 2.9],
    'y': [1.4, 1.9, 3.2]
})

Updating the intercept with gradient descent#

As in the course material, assume we only update the intercept (the slope is held fixed at 0.64).

# step 1: pick a starting value.
a = 0
# initialize some variables for plotting later.
graph_a = []
graph_loss = []

for i in range(0, 50): # assume we iterate only 50 times.
    print(f'>>> iteration: {i}')

    # step 2: compute the gradient.
    gradient = get_gradient(data=data, a=a, b=0.64)
    print(f'gradient = {gradient:.4f}')

    # step 3: update the parameter.
    a = update_param(param=a, gradient=gradient, lr=0.1)
    print(f'a = {a:.4f}')

    # store the intercept at each iteration, for plotting.
    graph_a.append(a)

    # compute and store the loss at each iteration, for plotting.
    loss = get_loss(data=data, a=a, b=0.64)
    graph_loss.append(loss)
    print(f'loss: {loss:.4f}\n')
>>> iteration: 0
gradient = -5.7040
a = 0.5704
loss: 0.8784

>>> iteration: 1
gradient = -2.2816
a = 0.7986
loss: 0.5140

>>> iteration: 2
gradient = -0.9126
a = 0.8898
loss: 0.4557

>>> iteration: 3
gradient = -0.3651
a = 0.9263
loss: 0.4464

>>> iteration: 4
gradient = -0.1460
a = 0.9409
loss: 0.4449

>>> iteration: 5
gradient = -0.0584
a = 0.9468
loss: 0.4447

>>> iteration: 6
gradient = -0.0234
a = 0.9491
loss: 0.4446

>>> iteration: 7
gradient = -0.0093
a = 0.9500
loss: 0.4446

>>> iteration: 8
gradient = -0.0037
a = 0.9504
loss: 0.4446

>>> iteration: 9
gradient = -0.0015
a = 0.9506
loss: 0.4446

>>> iteration: 10
gradient = -0.0006
a = 0.9506
loss: 0.4446

>>> iteration: 11
gradient = -0.0002
a = 0.9507
loss: 0.4446

>>> iteration: 12
gradient = -0.0001
a = 0.9507
loss: 0.4446

>>> iteration: 13
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 14
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 15
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 16
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 17
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 18
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 19
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 20
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 21
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 22
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 23
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 24
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 25
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 26
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 27
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 28
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 29
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 30
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 31
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 32
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 33
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 34
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 35
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 36
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 37
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 38
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 39
gradient = -0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 40
gradient = 0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 41
gradient = 0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 42
gradient = 0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 43
gradient = 0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 44
gradient = 0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 45
gradient = 0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 46
gradient = 0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 47
gradient = 0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 48
gradient = 0.0000
a = 0.9507
loss: 0.4446

>>> iteration: 49
gradient = 0.0000
a = 0.9507
loss: 0.4446
graph_df = pd.DataFrame({'a': graph_a, 'loss': graph_loss})
graph_df.plot.scatter(x='a', y='loss')
<Axes: xlabel='a', ylabel='loss'>
[figure: scatter plot of loss against a over the 50 iterations]

The plot above shows that, after 50 iterations, the intercept gradually moves from 0 to around 0.95.
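
As a quick cross-check (a sketch that is not part of the original walkthrough): with the slope fixed at 0.64, the loss is a simple quadratic in a, so its minimizer has the closed form mean(y - 0.64 * x). Computing it directly should give roughly the same value that gradient descent converged to.

# closed-form minimizer of the loss when b is fixed at 0.64
a_star = (data['y'] - 0.64 * data['x']).mean()
print(f'closed-form a: {a_star:.4f}')  # roughly 0.9507, matching the iterations above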

Updating the intercept & slope with gradient descent#

Next, the code below demonstrates how to update the intercept & slope at the same time.

def get_gradients(data, a, b):

    gradient_a = 0
    gradient_b = 0

    for i in range(0, data.shape[0]):
        x, y = data['x'][i], data['y'][i]
        gradient_a += -2 * ( y - model(a, b, x))
        gradient_b += -2 * x * ( y - model(a, b, x))

    return gradient_a, gradient_b
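As a side note, the same two gradients can also be computed without an explicit Python loop, using pandas column operations. A minimal sketch (the name get_gradients_vectorized is just for illustration; it assumes the same data DataFrame and model function as above):

def get_gradients_vectorized(data, a, b):
    # residuals y - y_hat for all rows at once
    residual = data['y'] - model(a, b, data['x'])
    gradient_a = (-2 * residual).sum()
    gradient_b = (-2 * data['x'] * residual).sum()
    return gradient_a, gradient_b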
# step 1: pick starting values.
a = 0
b = 1
# initialize some variables for plotting later.
graph_a = []
graph_b = []
graph_loss = []

for i in range(0, 701):
    # step 2: compute the gradients.
    gradient_a, gradient_b = get_gradients(data=data, a=a, b=b)

    # step 3: update the parameters.
    a = update_param(param=a, gradient=gradient_a, lr=0.01)
    b = update_param(param=b, gradient=gradient_b, lr=0.01)

    # store the intercept and slope at each iteration, for plotting.
    graph_a.append(a)
    graph_b.append(b)

    # compute and store the loss at each iteration, for plotting.
    loss = get_loss(data=data, a=a, b=b)
    graph_loss.append(loss)

    # since there are many iterations, only print every 10th iteration.
    if i % 10 == 0:
        print(f'>>> iteration: {i}')
        print(f'gradient_a = {gradient_a:.4f}, gradient_b = {gradient_b:.4f}')
        print(f'a = {a:.4f}, b = {b:.4f}')
        print(f'loss: {loss:.4f}')
        print('')
>>> iteration: 0
gradient_a = -1.6000, gradient_b = -0.8000
a = 0.0160, b = 1.0080
loss: 1.0311

>>> iteration: 10
gradient_a = -0.9601, gradient_b = 0.3810
a = 0.1269, b = 0.9906
loss: 0.9005

>>> iteration: 20
gradient_a = -0.8470, gradient_b = 0.3603
a = 0.2163, b = 0.9531
loss: 0.8068

>>> iteration: 30
gradient_a = -0.7549, gradient_b = 0.3216
a = 0.2958, b = 0.9192
loss: 0.7324

>>> iteration: 40
gradient_a = -0.6729, gradient_b = 0.2867
a = 0.3667, b = 0.8890
loss: 0.6733

>>> iteration: 50
gradient_a = -0.5998, gradient_b = 0.2556
a = 0.4299, b = 0.8621
loss: 0.6263

>>> iteration: 60
gradient_a = -0.5347, gradient_b = 0.2278
a = 0.4863, b = 0.8381
loss: 0.5890

>>> iteration: 70
gradient_a = -0.4766, gradient_b = 0.2031
a = 0.5365, b = 0.8167
loss: 0.5594

>>> iteration: 80
gradient_a = -0.4249, gradient_b = 0.1810
a = 0.5812, b = 0.7976
loss: 0.5358

>>> iteration: 90
gradient_a = -0.3787, gradient_b = 0.1614
a = 0.6212, b = 0.7806
loss: 0.5171

>>> iteration: 100
gradient_a = -0.3376, gradient_b = 0.1438
a = 0.6567, b = 0.7654
loss: 0.5022

>>> iteration: 110
gradient_a = -0.3009, gradient_b = 0.1282
a = 0.6884, b = 0.7519
loss: 0.4904

>>> iteration: 120
gradient_a = -0.2683, gradient_b = 0.1143
a = 0.7167, b = 0.7399
loss: 0.4810

>>> iteration: 130
gradient_a = -0.2391, gradient_b = 0.1019
a = 0.7419, b = 0.7291
loss: 0.4735

>>> iteration: 140
gradient_a = -0.2132, gradient_b = 0.0908
a = 0.7644, b = 0.7196
loss: 0.4676

>>> iteration: 150
gradient_a = -0.1900, gradient_b = 0.0810
a = 0.7844, b = 0.7110
loss: 0.4629

>>> iteration: 160
gradient_a = -0.1694, gradient_b = 0.0722
a = 0.8022, b = 0.7034
loss: 0.4591

>>> iteration: 170
gradient_a = -0.1510, gradient_b = 0.0643
a = 0.8181, b = 0.6967
loss: 0.4561

>>> iteration: 180
gradient_a = -0.1346, gradient_b = 0.0573
a = 0.8323, b = 0.6906
loss: 0.4538

>>> iteration: 190
gradient_a = -0.1200, gradient_b = 0.0511
a = 0.8450, b = 0.6852
loss: 0.4519

>>> iteration: 200
gradient_a = -0.1069, gradient_b = 0.0456
a = 0.8562, b = 0.6804
loss: 0.4504

>>> iteration: 210
gradient_a = -0.0953, gradient_b = 0.0406
a = 0.8663, b = 0.6762
loss: 0.4492

>>> iteration: 220
gradient_a = -0.0850, gradient_b = 0.0362
a = 0.8752, b = 0.6723
loss: 0.4483

>>> iteration: 230
gradient_a = -0.0758, gradient_b = 0.0323
a = 0.8832, b = 0.6689
loss: 0.4475

>>> iteration: 240
gradient_a = -0.0675, gradient_b = 0.0288
a = 0.8903, b = 0.6659
loss: 0.4469

>>> iteration: 250
gradient_a = -0.0602, gradient_b = 0.0256
a = 0.8967, b = 0.6632
loss: 0.4464

>>> iteration: 260
gradient_a = -0.0537, gradient_b = 0.0229
a = 0.9023, b = 0.6608
loss: 0.4461

>>> iteration: 270
gradient_a = -0.0478, gradient_b = 0.0204
a = 0.9073, b = 0.6587
loss: 0.4458

>>> iteration: 280
gradient_a = -0.0426, gradient_b = 0.0182
a = 0.9118, b = 0.6567
loss: 0.4455

>>> iteration: 290
gradient_a = -0.0380, gradient_b = 0.0162
a = 0.9158, b = 0.6550
loss: 0.4453

>>> iteration: 300
gradient_a = -0.0339, gradient_b = 0.0144
a = 0.9194, b = 0.6535
loss: 0.4452

>>> iteration: 310
gradient_a = -0.0302, gradient_b = 0.0129
a = 0.9226, b = 0.6522
loss: 0.4451

>>> iteration: 320
gradient_a = -0.0269, gradient_b = 0.0115
a = 0.9254, b = 0.6509
loss: 0.4450

>>> iteration: 330
gradient_a = -0.0240, gradient_b = 0.0102
a = 0.9280, b = 0.6499
loss: 0.4449

>>> iteration: 340
gradient_a = -0.0214, gradient_b = 0.0091
a = 0.9302, b = 0.6489
loss: 0.4448

>>> iteration: 350
gradient_a = -0.0191, gradient_b = 0.0081
a = 0.9322, b = 0.6481
loss: 0.4448

>>> iteration: 360
gradient_a = -0.0170, gradient_b = 0.0072
a = 0.9340, b = 0.6473
loss: 0.4448

>>> iteration: 370
gradient_a = -0.0152, gradient_b = 0.0065
a = 0.9356, b = 0.6466
loss: 0.4447

>>> iteration: 380
gradient_a = -0.0135, gradient_b = 0.0058
a = 0.9370, b = 0.6460
loss: 0.4447

>>> iteration: 390
gradient_a = -0.0120, gradient_b = 0.0051
a = 0.9383, b = 0.6455
loss: 0.4447

>>> iteration: 400
gradient_a = -0.0107, gradient_b = 0.0046
a = 0.9394, b = 0.6450
loss: 0.4447

>>> iteration: 410
gradient_a = -0.0096, gradient_b = 0.0041
a = 0.9404, b = 0.6446
loss: 0.4447

>>> iteration: 420
gradient_a = -0.0085, gradient_b = 0.0036
a = 0.9413, b = 0.6442
loss: 0.4447

>>> iteration: 430
gradient_a = -0.0076, gradient_b = 0.0032
a = 0.9421, b = 0.6438
loss: 0.4446

>>> iteration: 440
gradient_a = -0.0068, gradient_b = 0.0029
a = 0.9429, b = 0.6435
loss: 0.4446

>>> iteration: 450
gradient_a = -0.0060, gradient_b = 0.0026
a = 0.9435, b = 0.6433
loss: 0.4446

>>> iteration: 460
gradient_a = -0.0054, gradient_b = 0.0023
a = 0.9441, b = 0.6430
loss: 0.4446

>>> iteration: 470
gradient_a = -0.0048, gradient_b = 0.0020
a = 0.9446, b = 0.6428
loss: 0.4446

>>> iteration: 480
gradient_a = -0.0043, gradient_b = 0.0018
a = 0.9450, b = 0.6426
loss: 0.4446

>>> iteration: 490
gradient_a = -0.0038, gradient_b = 0.0016
a = 0.9454, b = 0.6424
loss: 0.4446

>>> iteration: 500
gradient_a = -0.0034, gradient_b = 0.0014
a = 0.9458, b = 0.6423
loss: 0.4446

>>> iteration: 510
gradient_a = -0.0030, gradient_b = 0.0013
a = 0.9461, b = 0.6421
loss: 0.4446

>>> iteration: 520
gradient_a = -0.0027, gradient_b = 0.0012
a = 0.9464, b = 0.6420
loss: 0.4446

>>> iteration: 530
gradient_a = -0.0024, gradient_b = 0.0010
a = 0.9466, b = 0.6419
loss: 0.4446

>>> iteration: 540
gradient_a = -0.0021, gradient_b = 0.0009
a = 0.9469, b = 0.6418
loss: 0.4446

>>> iteration: 550
gradient_a = -0.0019, gradient_b = 0.0008
a = 0.9471, b = 0.6417
loss: 0.4446

>>> iteration: 560
gradient_a = -0.0017, gradient_b = 0.0007
a = 0.9472, b = 0.6417
loss: 0.4446

>>> iteration: 570
gradient_a = -0.0015, gradient_b = 0.0006
a = 0.9474, b = 0.6416
loss: 0.4446

>>> iteration: 580
gradient_a = -0.0014, gradient_b = 0.0006
a = 0.9475, b = 0.6415
loss: 0.4446

>>> iteration: 590
gradient_a = -0.0012, gradient_b = 0.0005
a = 0.9477, b = 0.6415
loss: 0.4446

>>> iteration: 600
gradient_a = -0.0011, gradient_b = 0.0005
a = 0.9478, b = 0.6414
loss: 0.4446

>>> iteration: 610
gradient_a = -0.0010, gradient_b = 0.0004
a = 0.9479, b = 0.6414
loss: 0.4446

>>> iteration: 620
gradient_a = -0.0009, gradient_b = 0.0004
a = 0.9480, b = 0.6413
loss: 0.4446

>>> iteration: 630
gradient_a = -0.0008, gradient_b = 0.0003
a = 0.9481, b = 0.6413
loss: 0.4446

>>> iteration: 640
gradient_a = -0.0007, gradient_b = 0.0003
a = 0.9481, b = 0.6413
loss: 0.4446

>>> iteration: 650
gradient_a = -0.0006, gradient_b = 0.0003
a = 0.9482, b = 0.6412
loss: 0.4446

>>> iteration: 660
gradient_a = -0.0005, gradient_b = 0.0002
a = 0.9483, b = 0.6412
loss: 0.4446

>>> iteration: 670
gradient_a = -0.0005, gradient_b = 0.0002
a = 0.9483, b = 0.6412
loss: 0.4446

>>> iteration: 680
gradient_a = -0.0004, gradient_b = 0.0002
a = 0.9483, b = 0.6412
loss: 0.4446

>>> iteration: 690
gradient_a = -0.0004, gradient_b = 0.0002
a = 0.9484, b = 0.6412
loss: 0.4446

>>> iteration: 700
gradient_a = -0.0003, gradient_b = 0.0001
a = 0.9484, b = 0.6412
loss: 0.4446
graph_df = pd.DataFrame({'a': graph_a, 'b': graph_b, 'loss': graph_loss})
graph_df.plot.scatter(x='a', y='loss')
<Axes: xlabel='a', ylabel='loss'>
[figure: scatter plot of loss against a over the iterations]
graph_df.plot.scatter(x='b', y='loss')
<Axes: xlabel='b', ylabel='loss'>
[figure: scatter plot of loss against b over the iterations]
print(f'final a: {graph_a[-1]}')
print(f'final b: {graph_b[-1]}')
final a: 0.9484228728799461
final b: 0.6411513598871877

According to the results above, gradient descent finds an a close to 0.95 and a b of roughly 0.64.

Next, let's check whether the traditional OLS method produces similar estimates.

Comparison with OLS results#

import statsmodels.formula.api as smf
%matplotlib inline

The package used here is statsmodels, which lets you fit statistical models with a formula-like syntax, almost like writing out the equation.

result = smf.ols('y ~ x', data=data).fit()

The results are as follows:

The estimates are in fact very close to those obtained with gradient descent.

print(result.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.743
Model:                            OLS   Adj. R-squared:                  0.485
Method:                 Least Squares   F-statistic:                     2.884
Date:                Thu, 28 Mar 2024   Prob (F-statistic):              0.339
Time:                        07:29:54   Log-Likelihood:                -1.3931
No. Observations:                   3   AIC:                             6.786
Df Residuals:                       1   BIC:                             4.983
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.9487      0.814      1.165      0.451      -9.395      11.292
x              0.6410      0.377      1.698      0.339      -4.156       5.438
==============================================================================
Omnibus:                          nan   Durbin-Watson:                   2.846
Prob(Omnibus):                    nan   Jarque-Bera (JB):                0.392
Skew:                          -0.470   Prob(JB):                        0.822
Kurtosis:                       1.500   Cond. No.                         5.35
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
/Users/curtislu/Library/Caches/pypoetry/virtualenvs/ntu-ps-intro-ml-GyDE60mE-py3.11/lib/python3.11/site-packages/statsmodels/stats/stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 3 samples were given.
  warn("omni_normtest is not valid with less than 8 observations; %i "

You can also use Scikit-learn's LinearRegression to get the same result.

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Creating a Linear Regression model object (note that this reuses the name `model`, shadowing the function defined earlier)
model = LinearRegression()

# Fitting the model to the data (the data has to be reshaped into the expected format)
model.fit(data['x'].values.reshape(-1, 1), data['y'].values)

# Once fitted, you can get the coefficients and intercept of the linear regression line
print(f"Coefficient (slope): {model.coef_}")
print(f"Intercept: {model.intercept_}")
Coefficient (slope): [0.64102564]
Intercept: 0.9487179487179489
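
Finally, for this tiny dataset the coefficients can also be checked against the textbook closed-form formulas for simple linear regression, without any fitting library at all. A minimal sketch, assuming only numpy and the same three data points:

import numpy as np

x = np.array([0.5, 2.3, 2.9])
y = np.array([1.4, 1.9, 3.2])

# b = sum((x - x_bar) * (y - y_bar)) / sum((x - x_bar)^2),  a = y_bar - b * x_bar
b_hat = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean()) ** 2)
a_hat = y.mean() - b_hat * x.mean()
print(f'closed-form a: {a_hat:.4f}, b: {b_hat:.4f}')  # should be close to 0.9487 and 0.6410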