Run this notebook online: or Colab:

# 4.5. 权重衰减¶

## 4.5.1. 范数与权重衰减¶

(4.5.1)$L(\mathbf{w}, b) = \frac{1}{n}\sum_{i=1}^n \frac{1}{2}\left(\mathbf{w}^\top \mathbf{x}^{(i)} + b - y^{(i)}\right)^2.$

(4.5.2)$L(\mathbf{w}, b) + \frac{\lambda}{2} \|\mathbf{w}\|^2,$

(4.5.3)\begin{aligned} \mathbf{w} & \leftarrow \left(1- \eta\lambda \right) \mathbf{w} - \frac{\eta}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \mathbf{x}^{(i)} \left(\mathbf{w}^\top \mathbf{x}^{(i)} + b - y^{(i)}\right). \end{aligned}

## 4.5.2. 高维线性回归¶

%load ../utils/djl-imports

import ai.djl.engine.Engine;
import ai.djl.training.GradientCollector;
import ai.djl.training.listener.TrainingListener;
import org.apache.commons.lang3.ArrayUtils;
import tech.tablesaw.api.DoubleColumn;
import tech.tablesaw.api.Table;

int nTrain = 20;
int nTest = 100;
int numInputs = 200;
int batchSize = 5;

float trueB = 0.05f;
NDManager manager = NDManager.newBaseManager();
NDArray trueW = manager.ones(new Shape(numInputs, 1));
trueW = trueW.mul(0.01);

/**
 * Wraps a feature array and a label array in an {@code ArrayDataset}.
 *
 * @param features  array handed to the builder as the dataset's data
 * @param labels    array handed to the builder as the dataset's labels
 * @param batchSize batch size forwarded to {@code setSampling}
 * @param shuffle   random-sampling flag forwarded to {@code setSampling}
 * @return the dataset produced by the builder
 */
public ArrayDataset loadArray(NDArray features, NDArray labels, int batchSize, boolean shuffle) {
    ArrayDataset.Builder builder = new ArrayDataset.Builder();
    builder = builder.setData(features);               // features
    builder = builder.optLabels(labels);               // labels
    builder = builder.setSampling(batchSize, shuffle); // batch size and random sampling
    return builder.build();
}

// Training split: nTrain examples generated from the ground-truth trueW / trueB.
DataPoints trainData = DataPoints.syntheticData(manager, trueW, trueB, nTrain);

// Training iterator: batches of batchSize, with the shuffle flag set.
ArrayDataset trainIter = loadArray(trainData.getX(), trainData.getY(), batchSize, true);

// Test split: nTest examples generated from the same ground-truth parameters.
DataPoints testData = DataPoints.syntheticData(manager, trueW, trueB, nTest);

// Test iterator: same batch size, shuffle flag off.
ArrayDataset testIter = loadArray(testData.getX(), testData.getY(), batchSize, false);


(4.5.4)$y = 0.05 + \sum_{i = 1}^d 0.01 x_i + \epsilon \text{ where } \epsilon \sim \mathcal{N}(0, 0.01^2).$

## 4.5.3. 从零开始实现¶

### 4.5.3.1. 初始化模型参数¶

/**
 * Holder for freshly initialised parameters of the from-scratch linear model.
 *
 * <p>The weight is a {@code (numInputs, 1)} array drawn from a normal distribution
 * (loc 0, scale 1) and the bias is a single zero. Both arrays are flagged as
 * requiring gradients so that a backward pass can attach gradients to them.
 */
public class InitParams {

    private final NDArray w;
    private final NDArray b;

    public InitParams() {
        // The arrays are handed out through the getters, so the manager that owns
        // them is deliberately not closed here.
        NDManager manager = NDManager.newBaseManager();
        w = manager.randomNormal(0, 1.0f, new Shape(numInputs, 1), DataType.FLOAT32);
        b = manager.zeros(new Shape(1));

        // Mark both parameters as differentiable; the training loop back-propagates
        // the loss into them.
        w.setRequiresGradient(true);
        b.setRequiresGradient(true);
    }

    /** @return the weight array */
    public NDArray getW() {
        return this.w;
    }

    /** @return the bias array */
    public NDArray getB() {
        return this.b;
    }
}


### 4.5.3.2. 定义$$L_2$$范数惩罚¶

/**
 * Computes the L2 penalty of a weight array: the sum of its squared entries, halved.
 *
 * @param w weight array
 * @return {@code sum(w^2) / 2}
 */
public NDArray l2Penalty(NDArray w){
    NDArray squared = w.pow(2);
    NDArray total = squared.sum();
    return total.div(2);
}

// Shared L2 loss instance; train_djl passes it to DefaultTrainingConfig.
Loss l2loss = Loss.l2Loss();


### 4.5.3.3. 定义训练代码实现¶

// Curves recorded by train / train_djl every 5 epochs and read by the plotting cells.
// Each training call re-allocates them with numEpochs / 5 entries.
double[] trainLoss;   // log10 of the mean loss on the training set
double[] testLoss;    // log10 of the mean loss on the test set
double[] epochCount;  // epoch number at which each entry was recorded

public void train(float lambd) throws IOException, TranslateException {

InitParams initParams = new InitParams();

NDList params = new NDList(initParams.getW(), initParams.getB());

int numEpochs = Integer.getInteger("MAX_EPOCH", 100);
float lr = 0.003f;

trainLoss = new double[(numEpochs/5)];
testLoss = new double[(numEpochs/5)];
epochCount = new double[(numEpochs/5)];

for(int epoch = 1; epoch <= numEpochs; epoch++){

for(Batch batch : trainIter.getData(manager)){

NDArray w = params.get(0);
NDArray b = params.get(1);

// makes l2Penalty(w) a vector whose length is batch_size
NDArray l = Training.squaredLoss(Training.linreg(X, w, b), y).add(l2Penalty(w).mul(lambd));
gc.backward(l);  // Compute gradient on l with respect to w and b

}

batch.close();
Training.sgd(params, lr, batchSize);  // Update parameters using their gradient
}

if(epoch % 5 == 0){
NDArray testL = Training.squaredLoss(Training.linreg(testData.getX(), params.get(0), params.get(1)), testData.getY());
NDArray trainL = Training.squaredLoss(Training.linreg(trainData.getX(), params.get(0), params.get(1)), trainData.getY());

epochCount[epoch/5 - 1] = epoch;
trainLoss[epoch/5 -1] = trainL.mean().log10().getFloat();
testLoss[epoch/5 -1] = testL.mean().log10().getFloat();
}

}

System.out.println("l1 norm of w: " + params.get(0).abs().sum());
}


### 4.5.3.4. 忽略正则化直接训练¶

// Train without the L2 penalty (lambda = 0).
train(0f);

// Series labels: the first half of every column is the test curve, the second half the training curve.
String[] lossLabel = new String[trainLoss.length + testLoss.length];

Arrays.fill(lossLabel, 0, testLoss.length, "test");
Arrays.fill(lossLabel, testLoss.length, trainLoss.length + testLoss.length, "train");

// Long-format table: the epoch axis is repeated once per series, and the losses are
// concatenated in the same order as the labels (test first, then train).
Table data = Table.create("Data").addColumns(
    DoubleColumn.create("epochCount", ArrayUtils.addAll(epochCount, epochCount)),
    DoubleColumn.create("loss", ArrayUtils.addAll(testLoss, trainLoss)),
    StringColumn.create("lossLabel", lossLabel)
);

render(LinePlot.create("", data, "epochCount", "loss", "lossLabel"),"text/html");

l1 norm of w: ND: () gpu(0) float32
161.0816


### 4.5.3.5. 使用权重衰减¶

// calling training with weight decay lambda = 3.0
train(3f);

// Series labels: the first half of every column is the test curve, the second half the training curve.
String[] lossLabel = new String[trainLoss.length + testLoss.length];

Arrays.fill(lossLabel, 0, testLoss.length, "test");
Arrays.fill(lossLabel, testLoss.length, trainLoss.length + testLoss.length, "train");

// Long-format table: the epoch axis is repeated once per series, and the losses are
// concatenated in the same order as the labels (test first, then train).
Table data = Table.create("Data").addColumns(
    DoubleColumn.create("epochCount", ArrayUtils.addAll(epochCount, epochCount)),
    DoubleColumn.create("loss", ArrayUtils.addAll(testLoss, trainLoss)),
    StringColumn.create("lossLabel", lossLabel)
);

render(LinePlot.create("", data, "epochCount", "loss", "lossLabel"),"text/html");

l1 norm of w: ND: () gpu(0) float32
3.8813


## 4.5.4. 简洁实现¶

public void train_djl(float wd) throws IOException, TranslateException {

InitParams initParams = new InitParams();

NDList params = new NDList(initParams.getW(), initParams.getB());

int numEpochs = Integer.getInteger("MAX_EPOCH", 100);
float lr = 0.003f;

trainLoss = new double[(numEpochs/5)];
testLoss = new double[(numEpochs/5)];
epochCount = new double[(numEpochs/5)];

Tracker lrt = Tracker.fixed(lr);
Optimizer sgd = Optimizer.sgd().setLearningRateTracker(lrt).build();

Model model = Model.newInstance("mlp");

DefaultTrainingConfig config = new DefaultTrainingConfig(l2loss)
.optOptimizer(sgd) // Optimizer (loss function)
.optDevices(model.getNDManager().getEngine().getDevices(1)) // single CPU/GPU

SequentialBlock net = new SequentialBlock();
Linear linearBlock = Linear.builder().optBias(true).setUnits(1).build();

model.setBlock(net);
Trainer trainer = model.newTrainer(config);

trainer.initialize(new Shape(batchSize, 2));
for(int epoch = 1; epoch <= numEpochs; epoch++){

for(Batch batch : trainer.iterateDataset(trainIter)){

NDArray w = params.get(0);
NDArray b = params.get(1);

// Minibatch loss in X and y
NDArray l = Training.squaredLoss(Training.linreg(X, w, b), y).add(l2Penalty(w).mul(wd));
gc.backward(l);  // Compute gradient on l with respect to w and b

}
batch.close();
Training.sgd(params, lr, batchSize);  // Update parameters using their gradient
}

if(epoch % 5 == 0){
NDArray testL = Training.squaredLoss(Training.linreg(testData.getX(), params.get(0), params.get(1)), testData.getY());
NDArray trainL = Training.squaredLoss(Training.linreg(trainData.getX(), params.get(0), params.get(1)), trainData.getY());

epochCount[epoch/5 - 1] = epoch;
trainLoss[epoch/5 -1] = trainL.mean().log10().getFloat();
testLoss[epoch/5 -1] = testL.mean().log10().getFloat();
}

}
System.out.println("l1 norm of w: " + params.get(0).abs().sum());
}


// Run the DJL-based loop without the L2 penalty (wd = 0).
train_djl(0);

// Series labels: the first half of every column is the test curve, the second half the training curve.
String[] lossLabel = new String[trainLoss.length + testLoss.length];

Arrays.fill(lossLabel, 0, testLoss.length, "test");
Arrays.fill(lossLabel, testLoss.length, trainLoss.length + testLoss.length, "train");

// Long-format table: the epoch axis is repeated once per series, and the losses are
// concatenated in the same order as the labels (test first, then train).
Table data = Table.create("Data").addColumns(
    DoubleColumn.create("epochCount", ArrayUtils.addAll(epochCount, epochCount)),
    DoubleColumn.create("loss", ArrayUtils.addAll(testLoss, trainLoss)),
    StringColumn.create("lossLabel", lossLabel)
);

render(LinePlot.create("", data, "epochCount", "loss", "lossLabel"),"text/html");

INFO Training on: 1 GPUs.
INFO Load MXNet Engine Version 1.9.0 in 0.064 ms.

l1 norm of w: ND: () gpu(0) float32
145.8692

// NOTE(review): this repeats the previous cell's train_djl(0), and the recorded output
// below was produced with this value. If the intent is to demonstrate weight decay,
// a nonzero wd (e.g. 3f, as in the from-scratch section) is presumably wanted; confirm.
train_djl(0);

// Series labels: the first half of every column is the test curve, the second half the training curve.
String[] lossLabel = new String[trainLoss.length + testLoss.length];

Arrays.fill(lossLabel, 0, testLoss.length, "test");
Arrays.fill(lossLabel, testLoss.length, trainLoss.length + testLoss.length, "train");

// Long-format table: the epoch axis is repeated once per series, and the losses are
// concatenated in the same order as the labels (test first, then train).
Table data = Table.create("Data").addColumns(
    DoubleColumn.create("epochCount", ArrayUtils.addAll(epochCount, epochCount)),
    DoubleColumn.create("loss", ArrayUtils.addAll(testLoss, trainLoss)),
    StringColumn.create("lossLabel", lossLabel)
);

render(LinePlot.create("", data, "epochCount", "loss", "lossLabel"),"text/html");

INFO Training on: 1 GPUs.
INFO Load MXNet Engine Version 1.9.0 in 0.018 ms.

l1 norm of w: ND: () gpu(0) float32
157.4129


## 4.5.5. 小结¶

• 正则化是处理过拟合的常用方法。在训练集的损失函数中加入惩罚项，以降低学习到的模型的复杂度。

• 保持模型简单的一个特别的选择是使用$$L_2$$惩罚的权重衰减。这会导致学习算法更新步骤中的权重衰减。

• 权重衰减功能在DJL的优化器中提供。

• 在同一训练代码实现中，不同的参数集可以有不同的更新行为。

## 4.5.6. 练习¶

1. 在本节的估计问题中使用$$\lambda$$的值进行实验。绘制训练和测试准确率关于$$\lambda$$的函数。你观察到了什么？

2. 使用验证集来找到最佳值$$\lambda$$。它真的是最优值吗？这有关系吗？

3. 如果我们使用$$\sum_i |w_i|$$作为我们选择的惩罚（$$L_1$$正则化），那么更新方程会是什么样子？

4. 我们知道$$\|\mathbf{w}\|^2 = \mathbf{w}^\top \mathbf{w}$$。你能找到类似的矩阵方程吗（见 Section 2.3.10 中的弗罗贝尼乌斯范数）？

5. 回顾训练误差和泛化误差之间的关系。除了权重衰减、增加训练数据、使用适当复杂度的模型之外，你还能想出其他什么方法来处理过拟合？

6. 在贝叶斯统计中，我们使用先验和似然的乘积，通过公式$$P(w \mid x) \propto P(x \mid w) P(w)$$得到后验。如何得到带正则化的$$P(w)$$？