Go语言自然语言处理:文本生成与序列模型
自然语言处理(NLP)是人工智能领域的重要分支,涉及理解和生成人类语言。本文将深入探讨如何使用Go语言实现文本生成模型和序列模型。
一、序列模型概述
序列模型是处理序列数据的神经网络模型,在NLP中应用广泛:
- 循环神经网络(RNN):处理序列数据的基础模型
- 长短期记忆网络(LSTM):通过门控机制缓解RNN的梯度消失问题
- 门控循环单元(GRU):LSTM的简化版本
- Transformer:基于自注意力机制的革命性模型
二、循环神经网络(RNN)实现
2.1 基础RNN单元
package main

import (
	"fmt"
	"math"
	"math/rand"
)

// RNN is a minimal single-layer recurrent network: a tanh hidden layer
// followed by a linear output layer. The hidden state is kept on the struct
// and carried from one forward call to the next. Weight matrices are stored
// flat in row-major order.
type RNN struct {
	inputSize  int
	hiddenSize int
	outputSize int

	Wxh []float64 // input-to-hidden weights, inputSize x hiddenSize
	Whh []float64 // hidden-to-hidden weights, hiddenSize x hiddenSize
	Why []float64 // hidden-to-output weights, hiddenSize x outputSize
	bh  []float64 // hidden-layer bias
	by  []float64 // output-layer bias

	hidden []float64 // hidden state left by the most recent forward call
}

// NewRNN allocates an RNN with the given layer sizes, zero biases, a zero
// hidden state and randomly initialized weights.
func NewRNN(inputSize, hiddenSize, outputSize int) *RNN {
	rnn := &RNN{
		inputSize:  inputSize,
		hiddenSize: hiddenSize,
		outputSize: outputSize,
		Wxh:        make([]float64, inputSize*hiddenSize),
		Whh:        make([]float64, hiddenSize*hiddenSize),
		Why:        make([]float64, hiddenSize*outputSize),
		bh:         make([]float64, hiddenSize),
		by:         make([]float64, outputSize),
		hidden:     make([]float64, hiddenSize),
	}
	rnn.initWeights()
	return rnn
}

// initWeights fills every weight matrix with uniform random values scaled by
// 1/sqrt(fan-in). Unscaled values in [-1, 1) would push the tanh units into
// saturation as soon as the layers get wide.
func (rnn *RNN) initWeights() {
	rnnFillUniform(rnn.Wxh, rnn.inputSize)
	rnnFillUniform(rnn.Whh, rnn.hiddenSize)
	rnnFillUniform(rnn.Why, rnn.hiddenSize)
}

// rnnFillUniform overwrites w with values drawn uniformly from
// [-1/sqrt(fanIn), 1/sqrt(fanIn)).
func rnnFillUniform(w []float64, fanIn int) {
	if len(w) == 0 {
		return
	}
	scale := 1 / math.Sqrt(float64(fanIn))
	for i := range w {
		w[i] = (rand.Float64() - 0.5) * 2 * scale
	}
}

// forward advances the network by one time step. It computes the new hidden
// state tanh(input*Wxh + hidden*Whh + bh), stores it on the receiver, and
// returns the freshly allocated linear output hidden*Why + by.
//
// Only the first inputSize elements of input are read. forward panics with a
// descriptive message when input is shorter than that.
func (rnn *RNN) forward(input []float64) []float64 {
	if len(input) < rnn.inputSize {
		panic(fmt.Sprintf("rnn: input has %d values, need at least %d", len(input), rnn.inputSize))
	}

	// Hidden layer. The previous state is read-only until the loop finishes,
	// so the new state is built in a separate slice.
	next := make([]float64, rnn.hiddenSize)
	for i := range next {
		var sum float64
		for j := 0; j < rnn.inputSize; j++ {
			sum += input[j] * rnn.Wxh[j*rnn.hiddenSize+i]
		}
		for j, h := range rnn.hidden {
			sum += h * rnn.Whh[j*rnn.hiddenSize+i]
		}
		next[i] = math.Tanh(sum + rnn.bh[i])
	}
	rnn.hidden = next

	// Output layer.
	output := make([]float64, rnn.outputSize)
	for i := range output {
		var sum float64
		for j, h := range rnn.hidden {
			sum += h * rnn.Why[j*rnn.outputSize+i]
		}
		output[i] = sum + rnn.by[i]
	}
	return output
}

// resetHidden zeroes the hidden state in place, e.g. before a new sequence.
func (rnn *RNN) resetHidden() {
	for i := range rnn.hidden {
		rnn.hidden[i] = 0
	}
}

// Article section heading: "2.2 使用示例" (2.2 Usage example).
func main() { vocabSize := 26 hiddenSize := 128 rnn := NewRNN(vocabSize, hiddenSize, vocabSize) // 模拟输入序列(字母索引) input := []int{0, 1, 2, 3, 4, 5} rnn.resetHidden() for _, idx := range input { // 将索引转换为one-hot向量 oneHot := make([]float64, vocabSize) oneHot[idx] = 1.0 output := rnn.forward(oneHot) // 找到最大概率的输出 maxIdx := 0 maxVal := output[0] for i, val := range output { if val > maxVal { maxVal = val maxIdx = i } } fmt.Printf("输入: %d -> 输出: %d\n", idx, maxIdx) } }三、LSTM实现
// LSTM is a single long short-term memory cell. Each gate has one weight
// matrix applied to the concatenation [input, hidden]; the matrices are stored
// flat in row-major order with (inputSize+hiddenSize) rows and hiddenSize
// columns. The hidden and cell states live on the struct and persist across
// forward calls.
type LSTM struct {
	inputSize  int
	hiddenSize int

	Wf []float64 // forget-gate weights
	Wi []float64 // input-gate weights
	Wc []float64 // candidate-cell weights
	Wo []float64 // output-gate weights
	bf []float64 // forget-gate bias
	bi []float64 // input-gate bias
	bc []float64 // candidate-cell bias
	bo []float64 // output-gate bias

	hidden []float64 // hidden state h
	cell   []float64 // cell state c
}

// NewLSTM allocates an LSTM cell with zero biases, zero hidden and cell
// states, and randomly initialized gate weights.
func NewLSTM(inputSize, hiddenSize int) *LSTM {
	rows := inputSize + hiddenSize
	lstm := &LSTM{
		inputSize:  inputSize,
		hiddenSize: hiddenSize,
		Wf:         make([]float64, rows*hiddenSize),
		Wi:         make([]float64, rows*hiddenSize),
		Wc:         make([]float64, rows*hiddenSize),
		Wo:         make([]float64, rows*hiddenSize),
		bf:         make([]float64, hiddenSize),
		bi:         make([]float64, hiddenSize),
		bc:         make([]float64, hiddenSize),
		bo:         make([]float64, hiddenSize),
		hidden:     make([]float64, hiddenSize),
		cell:       make([]float64, hiddenSize),
	}
	lstm.initWeights()
	return lstm
}

// initWeights fills the four gate matrices with uniform random values scaled
// by 1/sqrt(fan-in). Every gate pre-activation sums over the concatenated
// [input, hidden] vector, so the fan-in is inputSize+hiddenSize.
func (lstm *LSTM) initWeights() {
	fanIn := lstm.inputSize + lstm.hiddenSize
	for _, w := range [][]float64{lstm.Wf, lstm.Wi, lstm.Wc, lstm.Wo} {
		lstmFillUniform(w, fanIn)
	}
}

// lstmFillUniform overwrites w with values drawn uniformly from
// [-1/sqrt(fanIn), 1/sqrt(fanIn)).
func lstmFillUniform(w []float64, fanIn int) {
	if len(w) == 0 {
		return
	}
	scale := 1 / math.Sqrt(float64(fanIn))
	for i := range w {
		w[i] = (rand.Float64() - 0.5) * 2 * scale
	}
}

// forward advances the cell by one time step and returns a copy of the new
// hidden state.
//
// With x = [input, previous hidden]:
//
//	f = sigmoid(x*Wf + bf)    i = sigmoid(x*Wi + bi)
//	g = tanh(x*Wc + bc)       o = sigmoid(x*Wo + bo)
//	cell = f*cell + i*g       hidden = o*tanh(cell)
//
// At most inputSize elements of input are used; a shorter input is
// zero-padded. The returned slice is owned by the caller: it is not aliased to
// the internal state, so later forward calls do not overwrite it.
func (lstm *LSTM) forward(input []float64) []float64 {
	n := lstm.hiddenSize

	// Concatenate the input with the previous hidden state. Working from this
	// snapshot is what makes it safe to update lstm.hidden in place below.
	concat := make([]float64, lstm.inputSize+n)
	copy(concat[:lstm.inputSize], input)
	copy(concat[lstm.inputSize:], lstm.hidden)

	for i := 0; i < n; i++ {
		// Accumulate all four gate pre-activations in one pass over concat.
		var f, in, g, o float64
		for j, x := range concat {
			at := j*n + i
			f += x * lstm.Wf[at]
			in += x * lstm.Wi[at]
			g += x * lstm.Wc[at]
			o += x * lstm.Wo[at]
		}
		forget := sigmoid(f + lstm.bf[i])
		admit := sigmoid(in + lstm.bi[i])
		candidate := math.Tanh(g + lstm.bc[i])
		expose := sigmoid(o + lstm.bo[i])

		lstm.cell[i] = forget*lstm.cell[i] + admit*candidate
		lstm.hidden[i] = expose * math.Tanh(lstm.cell[i])
	}

	out := make([]float64, n)
	copy(out, lstm.hidden)
	return out
}

// sigmoid returns the logistic function 1 / (1 + e^-x).
func sigmoid(x float64) float64 {
	return 1 / (1 + math.Exp(-x))
}

// Article section heading: "四、Transformer实现" (4. Transformer implementation).
4.1 自注意力机制
// ScaledDotProductAttention computes softmax(Q*K^T / sqrt(dK)) * V, where dK is
// the length of the first key row.
//
// Q, K and V are row-major matrices: one row per query, key and value. The
// result has one row per query and as many columns as the first value row.
// When Q, K or V has no rows there is nothing to attend to, and every query
// gets an empty output row instead of the function panicking.
//
// Rows are assumed to be rectangular, K rows at least as long as Q rows, and V
// to have at least as many rows as K; these are not checked.
func ScaledDotProductAttention(Q, K, V [][]float64) [][]float64 {
	output := make([][]float64, len(Q))
	if len(Q) == 0 || len(K) == 0 || len(V) == 0 {
		for i := range output {
			output[i] = []float64{}
		}
		return output
	}

	queryDim := len(Q[0])
	valueDim := len(V[0])

	// Loop-invariant scaling factor. Zero-width keys would give 0/0 = NaN
	// scores, so fall back to no scaling in that degenerate case.
	root := math.Sqrt(float64(len(K[0])))
	if root == 0 {
		root = 1
	}

	for i, query := range Q {
		// Scaled similarity of this query against every key.
		scores := make([]float64, len(K))
		for j, key := range K {
			var dot float64
			for k := 0; k < queryDim; k++ {
				dot += query[k] * key[k]
			}
			scores[j] = dot / root
		}

		weights := softmax(scores)

		// Attention-weighted sum of the value rows.
		row := make([]float64, valueDim)
		for j := range row {
			var acc float64
			for k, w := range weights {
				acc += w * V[k][j]
			}
			row[j] = acc
		}
		output[i] = row
	}
	return output
}

// softmax returns exp(x[i]) / sum(exp(x)) for every element, as a new slice.
// The maximum is subtracted before exponentiating so large inputs do not
// overflow. An empty input yields an empty result.
func softmax(x []float64) []float64 {
	maxVal := math.Inf(-1)
	for _, v := range x {
		if v > maxVal {
			maxVal = v
		}
	}

	out := make([]float64, len(x))
	var sum float64
	for i, v := range x {
		out[i] = math.Exp(v - maxVal)
		sum += out[i]
	}
	for i := range out {
		out[i] /= sum
	}
	return out
}

// Article section heading: "4.2 多头注意力" (4.2 Multi-head attention).
type MultiHeadAttention struct { numHeads int dModel int dK int dV int WQ []float64 WK []float64 WV []float64 WO []float64 } func NewMultiHeadAttention(numHeads, dModel int) *MultiHeadAttention { mha := &MultiHeadAttention{ numHeads: numHeads, dModel: dModel, dK: dModel / numHeads, dV: dModel / numHeads, } mha.WQ = make([]float64, dModel*dModel) mha.WK = make([]float64, dModel*dModel) mha.WV = make([]float64, dModel*dModel) mha.WO = make([]float64, dModel*dModel) mha.initWeights() return mha } func (mha *MultiHeadAttention) initWeights() { for i := range mha.WQ { mha.WQ[i] = (rand.Float64() - 0.5) * 2 / math.Sqrt(float64(mha.dModel)) } for i := range mha.WK { mha.WK[i] = (rand.Float64() - 0.5) * 2 / math.Sqrt(float64(mha.dModel)) } for i := range mha.WV { mha.WV[i] = (rand.Float64() - 0.5) * 2 / math.Sqrt(float64(mha.dModel)) } for i := range mha.WO { mha.WO[i] = (rand.Float64() - 0.5) * 2 / math.Sqrt(float64(mha.dModel)) } } func (mha *MultiHeadAttention) forward(Q, K, V [][]float64) [][]float64 { batchSize := len(Q) // 线性变换 Q = mha.linear(Q, mha.WQ) K = mha.linear(K, mha.WK) V = mha.linear(V, mha.WV) // 拆分多头 Q = mha.splitHeads(Q) K = mha.splitHeads(K) V = mha.splitHeads(V) // 计算注意力 output := make([][][]float64, mha.numHeads) for h := 0; h < mha.numHeads; h++ { output[h] = ScaledDotProductAttention(Q[h], K[h], V[h]) } // 合并多头 output = mha.mergeHeads(output) // 输出线性变换 output = mha.linear(output, mha.WO) return output } func (mha *MultiHeadAttention) linear(X [][]float64, W []float64) [][]float64 { output := make([][]float64, len(X)) for i := range output { output[i] = make([]float64, mha.dModel) for j := 0; j < mha.dModel; j++ { var sum float64 for k := 0; k < len(X[0]); k++ { sum += X[i][k] * W[k*mha.dModel+j] } output[i][j] = sum } } return output } func (mha *MultiHeadAttention) splitHeads(X [][]float64) [][][]float64 { heads := make([][][]float64, mha.numHeads) for h := 0; h < mha.numHeads; h++ { heads[h] = make([][]float64, len(X)) for i := range heads[h] { 
heads[h][i] = make([]float64, mha.dK) for j := 0; j < mha.dK; j++ { heads[h][i][j] = X[i][h*mha.dK+j] } } } return heads } func (mha *MultiHeadAttention) mergeHeads(heads [][][]float64) [][]float64 { output := make([][]float64, len(heads[0])) for i := range output { output[i] = make([]float64, mha.dModel) for h := 0; h < mha.numHeads; h++ { for j := 0; j < mha.dK; j++ { output[i][h*mha.dK+j] = heads[h][i][j] } } } return output }五、文本生成实战
func generateText(model *LSTM, startToken int, length int, vocabSize int) []int { result := []int{startToken} model.hidden = make([]float64, model.hiddenSize) model.cell = make([]float64, model.hiddenSize) // 预热阶段 for _, token := range result { input := make([]float64, vocabSize) input[token] = 1.0 model.forward(input) } // 生成阶段 for i := 0; i < length; i++ { input := make([]float64, vocabSize) input[result[len(result)-1]] = 1.0 output := model.forward(input) // 采样 token := sample(output) result = append(result, token) } return result } func sample(logits []float64) int { // 简单的贪婪采样 maxIdx := 0 maxVal := logits[0] for i, val := range logits { if val > maxVal { maxVal = val maxIdx = i } } return maxIdx }六、词向量嵌入
// Embedding is a lookup table that maps a token index to a dense vector.
type Embedding struct {
	vocabSize int
	embedDim  int
	weights   [][]float64 // one row of embedDim values per vocabulary entry
}

// NewEmbedding builds a table of vocabSize rows with embedDim columns, filled
// with uniform random values in [-1/sqrt(embedDim), 1/sqrt(embedDim)).
func NewEmbedding(vocabSize, embedDim int) *Embedding {
	table := make([][]float64, vocabSize)
	for row := range table {
		vec := make([]float64, embedDim)
		for col := range vec {
			vec[col] = (rand.Float64() - 0.5) * 2 / math.Sqrt(float64(embedDim))
		}
		table[row] = vec
	}
	return &Embedding{
		vocabSize: vocabSize,
		embedDim:  embedDim,
		weights:   table,
	}
}

// forward looks up the vector of every index, in order. Each returned row is a
// copy, so callers can modify the result without touching the table. An index
// outside the table causes an index-out-of-range panic.
func (emb *Embedding) forward(indices []int) [][]float64 {
	vectors := make([][]float64, 0, len(indices))
	for _, id := range indices {
		vec := make([]float64, emb.embedDim)
		copy(vec, emb.weights[id])
		vectors = append(vectors, vec)
	}
	return vectors
}

// Article section heading: "七、总结" (7. Summary).
本文介绍了序列模型的核心组件及其Go语言实现:
- 循环神经网络(RNN):基础序列建模
- 长短期记忆网络(LSTM):缓解梯度消失问题
- Transformer:基于自注意力机制的革命性模型
- 文本生成:使用LSTM生成文本
虽然Go语言在深度学习领域的生态不如Python成熟,但它的高性能特性使其成为部署生产级NLP系统的理想选择。结合Go的并发优势,可以构建高性能的文本处理系统。