Run this notebook online: SageMaker Studio Lab or Colab.

# 8.3. 语言模型和数据集¶

Section 8.2中， 我们了解了如何将文本数据映射为词元， 以及如何将这些词元视为一系列离散的观测（例如单词或字符）。 假设长度为$$T$$的文本序列中的词元依次为$$x_1, x_2, \ldots, x_T$$。 于是，$$x_t$$（$$1 \leq t \leq T$$） 可以被认为是文本序列在时间步$$t$$处的观测或标签。 在给定这样的文本序列时，语言模型（language model）的目标是估计序列的联合概率

(8.3.1)$P(x_1, x_2, \ldots, x_T).$

## 8.3.1. 学习语言模型¶

应用基本概率规则，我们可以将序列的联合概率分解为一系列条件概率的乘积：

(8.3.2)$P(x_1, x_2, \ldots, x_T) = \prod_{t=1}^T P(x_t \mid x_1, \ldots, x_{t-1}).$

(8.3.3)$P(\text{deep}, \text{learning}, \text{is}, \text{fun}) = P(\text{deep}) P(\text{learning} \mid \text{deep}) P(\text{is} \mid \text{deep}, \text{learning}) P(\text{fun} \mid \text{deep}, \text{learning}, \text{is}).$

(8.3.4)$\hat{P}(\text{learning} \mid \text{deep}) = \frac{n(\text{deep, learning})}{n(\text{deep})},$

(8.3.5)\begin{split}\begin{aligned} \hat{P}(x) & = \frac{n(x) + \epsilon_1/m}{n + \epsilon_1}, \\ \hat{P}(x' \mid x) & = \frac{n(x, x') + \epsilon_2 \hat{P}(x')}{n(x) + \epsilon_2}, \\ \hat{P}(x'' \mid x,x') & = \frac{n(x, x',x'') + \epsilon_3 \hat{P}(x'')}{n(x, x') + \epsilon_3}. \end{aligned}\end{split}

## 8.3.2. 马尔可夫模型与$$n$$元语法¶

(8.3.6)\begin{split}\begin{aligned} P(x_1, x_2, x_3, x_4) &= P(x_1) P(x_2) P(x_3) P(x_4),\\ P(x_1, x_2, x_3, x_4) &= P(x_1) P(x_2 \mid x_1) P(x_3 \mid x_2) P(x_4 \mid x_3),\\ P(x_1, x_2, x_3, x_4) &= P(x_1) P(x_2 \mid x_1) P(x_3 \mid x_1, x_2) P(x_4 \mid x_2, x_3). \end{aligned}\end{split}

## 8.3.3. 自然语言统计¶

%load ../utils/djl-imports


NDManager manager = NDManager.newBaseManager();

String[][] tokens = TimeMachine.tokenize(TimeMachine.readTimeMachine(), "word");
// Since each text line is not necessarily a sentence or a paragraph,
// we concatenate all lines into a single flat token list.
List<String> corpus = new ArrayList<>();
for (int i = 0; i < tokens.length; i++) {
    for (int j = 0; j < tokens[i].length; j++) {
        // Fix: compare String content with isEmpty() rather than reference
        // equality (!=), and actually add the token — the original loop
        // body was empty, leaving `corpus` empty.
        if (!tokens[i][j].isEmpty()) {
            corpus.add(tokens[i][j]);
        }
    }
}

// Build a vocabulary over the corpus (no min-frequency cutoff, no reserved
// tokens) and print the ten most frequent tokens.
Vocab vocab = new Vocab(new String[][] {corpus.toArray(new String[0])}, -1, new String[0]);
for (int i = 0; i < 10; i++) {
    Map.Entry<String, Integer> token = vocab.tokenFreqs.get(i);
    System.out.println(token.getKey() + ": " + token.getValue());
}

the: 2261
i: 1267
and: 1245
of: 1155
a: 816
to: 695
was: 552
in: 541
that: 443
my: 440


// Collect each token's frequency together with its rank index so the
// frequency curve can be inspected on a log-log plot.
int n = vocab.tokenFreqs.size();
double[] freqs = new double[n];
double[] x = new double[n];
int rank = 0;
while (rank < n) {
    freqs[rank] = vocab.tokenFreqs.get(rank).getValue();
    x[rank] = rank;
    rank++;
}

// Token rank vs. frequency on logarithmic axes.
PlotUtils.plotLogScale(new double[][] {x}, new double[][] {freqs}, new String[] {""},
        "token: x", "frequency: n(x)");


(8.3.7)$n_i \propto \frac{1}{i^\alpha},$

(8.3.8)$\log n_i = -\alpha \log i + c,$

// Form bigram tokens by pairing each token with its immediate successor.
String[] bigramTokens = new String[corpus.size() - 1];
for (int idx = 0; idx < bigramTokens.length; idx++) {
    bigramTokens[idx] = String.join(" ", corpus.get(idx), corpus.get(idx + 1));
}
// Build a vocabulary over the bigrams and print the ten most frequent ones.
Vocab bigramVocab = new Vocab(new String[][] {bigramTokens}, -1, new String[0]);
for (int idx = 0; idx < 10; idx++) {
    Map.Entry<String, Integer> entry = bigramVocab.tokenFreqs.get(idx);
    System.out.println(entry.getKey() + ": " + entry.getValue());
}

of the: 309
in the: 169
i was: 112
and the: 109
the time: 102
it was: 99
to the: 85
as i: 78
of a: 73


// Form trigram tokens by joining each token with its next two successors.
String[] trigramTokens = new String[corpus.size() - 2];
for (int idx = 0; idx < trigramTokens.length; idx++) {
    trigramTokens[idx] =
            String.join(" ", corpus.get(idx), corpus.get(idx + 1), corpus.get(idx + 2));
}
// Build a vocabulary over the trigrams and print the ten most frequent ones.
Vocab trigramVocab = new Vocab(new String[][] {trigramTokens}, -1, new String[0]);
for (int idx = 0; idx < 10; idx++) {
    Map.Entry<String, Integer> entry = trigramVocab.tokenFreqs.get(idx);
    System.out.println(entry.getKey() + ": " + entry.getValue());
}

the time traveller: 59
the time machine: 30
: 26
the medical man: 24
it seemed to: 16
it was a: 15
here and there: 15
seemed to me: 14
i did not: 14
i saw the: 13


// Rank/frequency arrays for the bigram vocabulary (reuses `n` from above).
n = bigramVocab.tokenFreqs.size();
double[] bigramFreqs = new double[n];
double[] bigramX = new double[n];
for (int r = 0; r < n; ++r) {
    bigramFreqs[r] = bigramVocab.tokenFreqs.get(r).getValue();
    bigramX[r] = r;
}

// Rank/frequency arrays for the trigram vocabulary.
n = trigramVocab.tokenFreqs.size();
double[] trigramFreqs = new double[n];
double[] trigramX = new double[n];
for (int r = 0; r < n; ++r) {
    trigramFreqs[r] = trigramVocab.tokenFreqs.get(r).getValue();
    trigramX[r] = r;
}

// Compare unigram, bigram, and trigram frequency curves on one log-log plot.
PlotUtils.plotLogScale(new double[][] {x, bigramX, trigramX},
        new double[][] {freqs, bigramFreqs, trigramFreqs},
        new String[] {"unigram", "bigram", "trigram"}, "token: x", "frequency: n(x)");


## 8.3.4. 读取长序列数据¶

### 8.3.4.1. 随机采样¶

/**
 * Generate a list of minibatches of subsequences using random sampling.
 *
 * Each minibatch is an NDList pair (X, Y) of shape (batchSize, numSteps),
 * where Y is X shifted one position to the right (the labels).
 *
 * Fixes relative to the original: the starting indices were never added to
 * {@code initialIndices} (empty loop body), the per-batch index slice was
 * computed but unused, X/Y were sized by the FULL index list instead of
 * {@code batchSize} (which produced the (6, 5) shapes seen in the sample
 * output instead of (2, 5)), and the (X, Y) pairs were never appended to
 * the result, so an empty list was returned.
 */
public ArrayList<NDList>
seqDataIterRandom(List<Integer> corpus, int batchSize, int numSteps, NDManager manager) {
    // Start from a random offset (up to numSteps - 1) to partition a sequence.
    corpus = corpus.subList(new Random().nextInt(numSteps - 1), corpus.size());
    // Subtract 1 because we need to account for the labels (Y is X shifted by 1).
    int numSubseqs = (corpus.size() - 1) / numSteps;
    // Starting indices for subsequences of length numSteps.
    List<Integer> initialIndices = new ArrayList<>();
    for (int i = 0; i < numSubseqs * numSteps; i += numSteps) {
        initialIndices.add(i);
    }
    // In random sampling, subsequences from two adjacent minibatches are not
    // necessarily adjacent on the original sequence.
    Collections.shuffle(initialIndices);

    int numBatches = numSubseqs / batchSize;

    ArrayList<NDList> pairs = new ArrayList<NDList>();
    for (int i = 0; i < batchSize * numBatches; i += batchSize) {
        // The starting indices of the subsequences that form this minibatch.
        List<Integer> initialIndicesPerBatch = initialIndices.subList(i, i + batchSize);

        NDArray xNDArray = manager.create(new Shape(batchSize, numSteps), DataType.INT32);
        NDArray yNDArray = manager.create(new Shape(batchSize, numSteps), DataType.INT32);
        for (int j = 0; j < batchSize; j++) {
            ArrayList<Integer> X = data(initialIndicesPerBatch.get(j), corpus, numSteps);
            xNDArray.set(new NDIndex(j), manager.create(X.stream().mapToInt(Integer::intValue).toArray()));
            ArrayList<Integer> Y = data(initialIndicesPerBatch.get(j) + 1, corpus, numSteps);
            yNDArray.set(new NDIndex(j), manager.create(Y.stream().mapToInt(Integer::intValue).toArray()));
        }
        NDList pair = new NDList();
        pair.add(xNDArray);
        pair.add(yNDArray);
        pairs.add(pair);
    }
    return pairs;
}

ArrayList<Integer> data(int pos, List<Integer> corpus, int numSteps) {
    // Copy out the numSteps-long window of the corpus starting at pos.
    List<Integer> window = corpus.subList(pos, pos + numSteps);
    return new ArrayList<Integer>(window);
}


// Build the toy sequence 0..34. Fix: the original loop body was empty, so
// mySeq stayed empty and the iterator below produced nothing.
List<Integer> mySeq = new ArrayList<>();
for (int i = 0; i < 35; i++) {
    mySeq.add(i);
}

// Print each randomly sampled (X, Y) minibatch of shape (2, 5).
for (NDList pair : seqDataIterRandom(mySeq, 2, 5, manager)) {
    System.out.println("X:\n" + pair.get(0).toDebugString(50, 50, 50, 50, true));
    System.out.println("Y:\n" + pair.get(1).toDebugString(50, 50, 50, 50, true));
}

X:
ND: (6, 5) gpu(0) int32
[[ 8,  9, 10, 11, 12],
[13, 14, 15, 16, 17],
[ 3,  4,  5,  6,  7],
[28, 29, 30, 31, 32],
[18, 19, 20, 21, 22],
[23, 24, 25, 26, 27],
]

Y:
ND: (6, 5) gpu(0) int32
[[ 9, 10, 11, 12, 13],
[14, 15, 16, 17, 18],
[ 4,  5,  6,  7,  8],
[29, 30, 31, 32, 33],
[19, 20, 21, 22, 23],
[24, 25, 26, 27, 28],
]

X:
ND: (6, 5) gpu(0) int32
[[ 8,  9, 10, 11, 12],
[13, 14, 15, 16, 17],
[ 3,  4,  5,  6,  7],
[28, 29, 30, 31, 32],
[18, 19, 20, 21, 22],
[23, 24, 25, 26, 27],
]

Y:
ND: (6, 5) gpu(0) int32
[[ 9, 10, 11, 12, 13],
[14, 15, 16, 17, 18],
[ 4,  5,  6,  7,  8],
[29, 30, 31, 32, 33],
[19, 20, 21, 22, 23],
[24, 25, 26, 27, 28],
]

X:
ND: (6, 5) gpu(0) int32
[[ 8,  9, 10, 11, 12],
[13, 14, 15, 16, 17],
[ 3,  4,  5,  6,  7],
[28, 29, 30, 31, 32],
[18, 19, 20, 21, 22],
[23, 24, 25, 26, 27],
]

Y:
ND: (6, 5) gpu(0) int32
[[ 9, 10, 11, 12, 13],
[14, 15, 16, 17, 18],
[ 4,  5,  6,  7,  8],
[29, 30, 31, 32, 33],
[19, 20, 21, 22, 23],
[24, 25, 26, 27, 28],
]


### 8.3.4.2. 顺序分区¶

/**
* 使用顺序分区生成一小批 子序列。
*/
public ArrayList<NDList> seqDataIterSequential(List<Integer> corpus, int batchSize, int numSteps,
NDManager manager) {
// 从随机偏移量开始划分序列
int offset = new Random().nextInt(numSteps);
int numTokens = ((corpus.size() - offset - 1) / batchSize) * batchSize;

NDArray Xs = manager.create(
corpus.subList(offset, offset + numTokens).stream().mapToInt(Integer::intValue).toArray());
NDArray Ys = manager.create(
corpus.subList(offset + 1, offset + 1 + numTokens).stream().mapToInt(Integer::intValue).toArray());
Xs = Xs.reshape(new Shape(batchSize, -1));
Ys = Ys.reshape(new Shape(batchSize, -1));
int numBatches = (int) Xs.getShape().get(1) / numSteps;

ArrayList