In [1]:
import numpy as np
import math, json, time, types, copy, sys, os
import torch
from torch.nn import functional as F
import torch.nn as nn

from transformers import PreTrainedTokenizerFast

np.set_printoptions(precision=4, suppress=True, linewidth=200)

In [2]:
# Notebook-wide configuration. The model classes and the RNN wrapper defined in
# later cells read these names as module-level globals.
RUN_DEVICE = 'cpu' # device used when loading the checkpoint and allocating state: 'cpu' or 'cuda'
ctx_len = 768   # sizes the TimeMix decay curve and caps the context slice used during generation
n_layer = 12    # number of blocks iterated by RWKV_RNN.run
n_embd = 768    # embedding / channel width of every layer
# Alternative dimensions -- presumably for the 430M checkpoint commented out below (TODO confirm):
# n_layer = 24
# n_embd = 1024

# ---> download RWKV-3 169M model from https://huggingface.co/BlinkDL/rwkv-3-pile-169m/tree/main

# Checkpoint path WITHOUT the '.pth' extension (RWKV_RNN appends it when loading).
# MODEL_NAME = '/data1/ckw/RWKV-3-Pile-430M-20220817-10602'
MODEL_NAME = '/data1/ckw/RWKV-3-Pile-20220720-10704'
K_EPS = 1e-8    # added to the WKV denominator to avoid division by zero

vocab_size = 50277                  # not referenced by the cells visible here
VOCAB_NAME = '20B_tokenizer.json'   # tokenizer file handed to PreTrainedTokenizerFast

print(f'\n* running on {RUN_DEVICE}')


* running on cpu


在v3版本中,对RWKV模型的TimeMix和ChannelMix部分进行了一些修改,主要变化如下:

1. **ChannelMix**:
 - 在v2版本中,ChannelMix模块只使用一个共享的时间混合参数,把当前时间步的输入与上一个时间步(time-shift)的输入按比例混合。v3版本并没有移除这一技巧,而是保留了time-shift,并对混合方式进行了细化。
 - 具体来说,v3版本新增了`time_mix_k`和`time_mix_r`两个可学习参数(v2版本中不存在),分别控制用于计算key和receptance的输入的混合比例。

2. **TimeMix**:
 - 在v2版本中,TimeMix模块同样使用了时间混合技巧,并且采用单一的`time_mix`参数来控制时间混合的程度。
 - v3版本保留了时间混合,但用`time_mix_k`、`time_mix_v`和`time_mix_r`三个独立的参数取代了单一的`time_mix`,分别对用于计算key、value和receptance的输入进行混合。
 - 也就是说,v3版本中`time_mix_v`参数依然存在,value的输入同样会进行时间混合(见下方`RWKV_TimeMix`的实现)。

总体来说,v3版本并没有取消时间混合,而是对时间混合和通道混合的操作进行了细化:为key、value、receptance分别设置独立的可学习混合参数,使模型更加灵活。

In [3]:
class RWKV_ChannelMix(nn.Module):
    """Channel-mixing (feed-forward) sub-layer with per-channel token-shift mixing.

    The key and receptance projections each see their own learned blend of the
    current token and the previous token. The key path is expanded to
    ``4 * n_embd`` channels, passed through squared ReLU, projected back by
    ``value`` and finally gated by ``sigmoid(receptance)``.
    """

    def __init__(self, layer_id):
        super().__init__()
        self.layer_id = layer_id

        # Shifts the sequence one step along the time axis: a zero row is
        # inserted at the front and the last row is dropped.
        self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
        # Per-channel blend factors; 1.0 means "use only the current token".
        self.time_mix_k = nn.Parameter(torch.ones(1, 1, n_embd))
        self.time_mix_r = nn.Parameter(torch.ones(1, 1, n_embd))

        hidden_dim = 4 * n_embd
        self.key = nn.Linear(n_embd, hidden_dim, bias=False)
        self.receptance = nn.Linear(n_embd, n_embd, bias=False)
        self.value = nn.Linear(hidden_dim, n_embd, bias=False)

    def forward(self, x):
        """Apply channel mixing to ``x`` and return a tensor of the same shape."""
        shifted = self.time_shift(x)
        key_in = x * self.time_mix_k + shifted * (1 - self.time_mix_k)
        gate_in = x * self.time_mix_r + shifted * (1 - self.time_mix_r)

        hidden = torch.square(torch.relu(self.key(key_in)))
        projected = self.value(hidden)

        gate = torch.sigmoid(self.receptance(gate_in))
        return gate * projected

In [4]:
class RWKV_TimeMix(nn.Module):
    """Time-mixing (WKV attention) sub-layer evaluated over a whole sequence.

    Key, value and receptance each get their own learned blend of the current
    and previous token. The exponentiated keys weight the values, and a
    per-channel causal kernel (exponential decay for past tokens,
    ``time_first`` for the current token) is applied with a grouped 1-D
    convolution to obtain the normalised weighted average ``wkv / wk``.
    """

    def __init__(self, layer_id):
        super().__init__()
        self.layer_id = layer_id
        self.time_decay = nn.Parameter(torch.ones(n_embd, 1))
        # Row of integer offsets -(ctx_len-2), ..., -1, 0 with shape (1, ctx_len-1).
        self.time_curve = torch.tensor(list(range(-(ctx_len - 2), 1))).unsqueeze(0)
        self.time_first = nn.Parameter(torch.full((n_embd, 1), math.log(0.3)))

        # Shifts the sequence one step along the time axis (zero row in front).
        self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
        self.time_mix_k = nn.Parameter(torch.ones(1, 1, n_embd))
        self.time_mix_v = nn.Parameter(torch.ones(1, 1, n_embd))
        self.time_mix_r = nn.Parameter(torch.ones(1, 1, n_embd))

        self.key = nn.Linear(n_embd, n_embd, bias=False)
        self.value = nn.Linear(n_embd, n_embd, bias=False)
        self.receptance = nn.Linear(n_embd, n_embd, bias=False)

        self.output = nn.Linear(n_embd, n_embd, bias=False)

    def forward(self, x):
        """Apply time mixing to ``x`` of shape (B, T, C); the result has the same shape."""
        _, T, C = x.size()

        shifted = self.time_shift(x)
        key_in = x * self.time_mix_k + shifted * (1 - self.time_mix_k)
        value_in = x * self.time_mix_v + shifted * (1 - self.time_mix_v)
        gate_in = x * self.time_mix_r + shifted * (1 - self.time_mix_r)

        # Move channels in front of time, (B, C, T), as conv1d expects.
        k = self.key(key_in).transpose(-1, -2)
        v = self.value(value_in).transpose(-1, -2)
        r = self.receptance(gate_in)

        # Clamp before exponentiating so exp() cannot overflow.
        k = torch.exp(torch.clamp(k, max=60))
        kv = k * v

        # Log-weights per channel: scaled decay curve for past tokens, then
        # time_first as the last column (the current token).
        decay_curve = torch.exp(self.time_decay) * self.time_curve.to(self.time_decay.device)
        self.time_w = torch.cat([decay_curve, self.time_first], dim=-1)
        # Keep only the last T taps -> kernel of shape (C, 1, T).
        kernel = torch.exp(self.time_w)[:, -T:].unsqueeze(1)

        # Left-pad with T-1 zeros so the convolution is causal.
        causal_pad = (T - 1, 0, 0, 0)
        wkv = F.conv1d(F.pad(kv, causal_pad), kernel, groups=C)
        wk = F.conv1d(F.pad(k, causal_pad), kernel, groups=C) + K_EPS

        gated = torch.sigmoid(r) * (wkv / wk).transpose(-1, -2)
        return self.output(gated)

在这段代码中,`xk`、`xv` 和 `xr` 分别表示经过时间混合后的输入,用于后续计算的键、值和接收向量。在v3版本中,这三个向量是分别计算的,而在v2版本中,这三个向量是在同一个时间混合过程中计算的。

具体来说,`xk` 是通过时间混合后的输入用于计算键向量,`xv` 是用于计算值向量,`xr` 是用于计算接收向量。这种分别计算的方法可以使得模型更加灵活,能够更好地适应不同的数据特征。

分别计算键、值和接收向量的方法,类似的操作在一些相关的论文或研究中也有提及,例如在自注意力机制中,通常会分别计算键、值和查询向量。这种分别计算的方法可以提高模型的灵活性和表现能力,因此在实践中被广泛应用。

### RWKV的Block

RWKV的Block是一个基本的模块,它结合了时间混合(TimeMix)和通道混合(ChannelMix)操作。Block中的每个模块(时间混合和通道混合)都通过归一化和残差连接来处理输入数据,从而增强模型的稳定性和性能。

### 主要组件和操作

1. **LayerNorm**:用于归一化输入,增强训练的稳定性。
 - `self.ln1` 和 `self.ln2` 分别在时间混合和通道混合之前对输入进行归一化。
 
2. **时间混合(TimeMix)**:结合当前时间步和前一个时间步的信息,捕获时间依赖性。
 - `self.att = RWKV_TimeMix(layer_id)` 初始化时间混合模块。
 
3. **通道混合(ChannelMix)**:在不同通道间进行混合,增强模型的表达能力。
 - `self.ffn = RWKV_ChannelMix(layer_id)` 初始化通道混合模块。
 
4. **残差连接**:通过将混合操作的输出加回到原始输入上,保持信息流动并增强模型的梯度传播能力。

通过这种设计,RWKV的Block能够高效地处理序列数据,结合时间和通道信息,提高模型的表现。

In [5]:
class Block(nn.Module):
    """One RWKV layer: pre-norm time mixing then pre-norm channel mixing, each with a residual.

    The block with ``layer_id == 0`` additionally owns ``ln0``, a LayerNorm
    applied to its input before anything else.
    """

    def __init__(self, layer_id):
        super().__init__()
        self.layer_id = layer_id

        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        if layer_id == 0:
            # Extra initial normalisation, present only in the first block.
            self.ln0 = nn.LayerNorm(n_embd)

        self.att = RWKV_TimeMix(layer_id)
        self.ffn = RWKV_ChannelMix(layer_id)

    def forward(self, x):
        """Run the block on ``x`` and return a tensor of the same shape."""
        hidden = self.ln0(x) if self.layer_id == 0 else x
        hidden = hidden + self.att(self.ln1(hidden))
        return hidden + self.ffn(self.ln2(hidden))

接下来,实现了RWKV模型的主要部分:

1. **模型加载和预处理**:代码中加载模型权重并进行时间相关权重的预处理。
2. **LayerNorm**:`LN`方法实现了层归一化(LayerNorm)。
3. **前馈网络(FF)和自注意力(SA)**:`FF`方法实现了前馈网络的计算,`SA`方法实现了自注意力机制的计算,二者分别对应ChannelMix和TimeMix的逐token(RNN形式)计算。
4. **运行模型**:`run`方法实现了模型的整体运行逻辑,依次通过每一层,并最终输出结果。即模型的运行和推理过程。

In [11]:
time_buf = {}  # module-level dict; not referenced by the code in this cell


class RWKV_RNN():
    """RNN-mode (one token per call) inference wrapper around an RWKV-3 checkpoint.

    The checkpoint's flat, dot-separated keys are unpacked into nested
    namespaces under ``self.w`` so that layers can be addressed as
    ``self.w.blocks[i].att`` / ``self.w.blocks[i].ffn``. Recurrent state is
    kept in three dicts keyed by sub-layer name (``'att.<i>'`` / ``'ffn.<i>'``):

    * ``self.xx`` - the previous input of each sub-layer (token shift),
    * ``self.aa`` - decayed running sum of ``k * v`` for each att sub-layer,
    * ``self.bb`` - decayed running sum of ``k`` for each att sub-layer.

    The layer count and width captured at construction time (``self.n_layer``,
    ``self.n_embd``) are what the methods use afterwards, so re-running the
    configuration cell cannot silently desynchronise an existing model.
    """

    def __init__(self, MODEL_NAME=MODEL_NAME):
        """Load ``MODEL_NAME + '.pth'`` onto ``RUN_DEVICE`` and build the weight tree."""
        print('\nloading RWKV-RNN', MODEL_NAME)
        self.ctx_len = ctx_len
        self.n_layer = n_layer
        self.n_embd = n_embd
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=VOCAB_NAME)

        self.w = types.SimpleNamespace()

        # NOTE(security): torch.load unpickles the file, which can execute
        # arbitrary code. Only load checkpoints obtained from a trusted source.
        w = torch.load(MODEL_NAME + '.pth', map_location=torch.device(RUN_DEVICE))

        for key in w.keys():
            tensor = w[key]
            # Pre-compute the per-channel time constants once, so that SA() can
            # use them directly as multiplicative factors.
            if '.time_' in key:
                tensor = tensor.squeeze()
            if '.time_decay' in key:
                tensor = torch.exp(-torch.exp(tensor))
            if '.time_first' in key:
                tensor = torch.exp(tensor)
            w[key] = tensor

            # Walk the dotted key, creating containers on the way down:
            # numeric components index a dict, the others are namespace attributes.
            parts = key.split('.')
            here = self.w
            for i, part in enumerate(parts):
                if part.isdigit():
                    idx = int(part)
                    if idx not in here:
                        here[idx] = types.SimpleNamespace()
                    here = here[idx]
                else:
                    if i == len(parts) - 1:
                        setattr(here, part, tensor)
                    elif not hasattr(here, part):
                        if parts[i + 1].isdigit():
                            setattr(here, part, {})
                        else:
                            setattr(here, part, types.SimpleNamespace())
                    here = getattr(here, part)

        self.clear()

    def clear(self):
        """Drop all recurrent state; it is re-created as zeros on the next call to run()."""
        self.xx = {}
        self.aa = {}
        self.bb = {}

    def save(self, target):
        """Store a deep copy of the recurrent state on ``target`` (attributes xx, aa, bb)."""
        target.xx = copy.deepcopy(self.xx)
        target.aa = copy.deepcopy(self.aa)
        target.bb = copy.deepcopy(self.bb)

    def load(self, target):
        """Replace the recurrent state with a deep copy of the state held by ``target``."""
        self.xx = copy.deepcopy(target.xx)
        self.aa = copy.deepcopy(target.aa)
        self.bb = copy.deepcopy(target.bb)

    def LN(self, xx, w):
        """Layer-normalise ``xx`` over its last ``self.n_embd`` elements with ``w.weight`` / ``w.bias``."""
        return F.layer_norm(xx, (self.n_embd,), weight=w.weight, bias=w.bias)

    def FF(self, xx, w, name):
        """Channel-mix step for one token; updates the token-shift state ``self.xx[name]``."""
        if name not in self.xx:
            # The state follows the activation's shape, dtype and device.
            self.xx[name] = torch.zeros_like(xx)
        xk = xx * w.time_mix_k + self.xx[name] * (1 - w.time_mix_k)
        xr = xx * w.time_mix_r + self.xx[name] * (1 - w.time_mix_r)

        self.xx[name] = xx

        r = torch.sigmoid(w.receptance.weight @ xr)
        k = torch.square(torch.relu(w.key.weight @ xk))
        kv = w.value.weight @ k

        return r * kv

    def SA(self, xx, w, name):
        """Time-mix (WKV) step for one token; updates ``self.xx``, ``self.aa`` and ``self.bb`` for ``name``."""
        if name not in self.xx:
            # The state follows the activation's shape, dtype and device.
            self.xx[name] = torch.zeros_like(xx)
            self.aa[name] = torch.zeros_like(xx)
            self.bb[name] = torch.zeros_like(xx)

        xk = xx * w.time_mix_k + self.xx[name] * (1 - w.time_mix_k)
        xv = xx * w.time_mix_v + self.xx[name] * (1 - w.time_mix_v)
        xr = xx * w.time_mix_r + self.xx[name] * (1 - w.time_mix_r)

        self.xx[name] = xx

        r = torch.sigmoid(w.receptance.weight @ xr)

        # Clamp before exponentiating so exp() cannot overflow.
        k = torch.exp(torch.clamp(w.key.weight @ xk, max=60))
        v = w.value.weight @ xv
        kv = k * v

        # The current token is weighted by time_first; the accumulated past is
        # then decayed by time_decay and the un-boosted current token is added.
        a = self.aa[name] + w.time_first * kv
        b = self.bb[name] + w.time_first * k
        self.aa[name] = w.time_decay * self.aa[name] + kv
        self.bb[name] = w.time_decay * self.bb[name] + k

        rwkv = r * a / (b + K_EPS)

        return w.output.weight @ rwkv

    def run(self, ctx):
        """Advance the RNN by one token and return the next-token logits.

        Only ``ctx[-1]`` is consumed; everything earlier is assumed to be
        already reflected in the recurrent state.

        Returns:
            The logits (``head.weight @ x``) as a plain Python list.
        """
        w = self.w
        x = w.emb.weight[ctx[-1]]

        # The first block's ln0 is applied once to the embedding.
        x = self.LN(x, w.blocks[0].ln0)
        for i in range(self.n_layer):
            x = x + self.SA(self.LN(x, w.blocks[i].ln1), w.blocks[i].att, f'att.{i}')
            x = x + self.FF(self.LN(x, w.blocks[i].ln2), w.blocks[i].ffn, f'ffn.{i}')

        x = self.LN(x, w.ln_out)

        x = w.head.weight @ x
        x = x.tolist()

        return x

In [12]:
# Sampling / run configuration for the generation cell below.
# The device is selected through RUN_DEVICE in the configuration cell near the
# top of the notebook (CPU by default).

TEMPERATURE = 1.0   # passed to sample_logits as `temperature`
TOP_P = 0.7         # passed to sample_logits as `top_p`

DEBUG_DEBUG = False     # True: run a single trial, print the prompt logits, generate no tokens
LENGTH_OF_EACH = 333    # number of tokens generated per trial
NUM_TRIALS = 3          # number of independent continuations of the same prompt

# Prompt text that every trial continues.
context = '\nDataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners learn artificial intelligence.'

##############################################################################################################

In [13]:
# Build the RNN-mode model: loads MODEL_NAME + '.pth' and the tokenizer, printing the checkpoint name.
model = RWKV_RNN()


loading RWKV-RNN /data1/ckw/RWKV-3-Pile-20220720-10704


下面我们从给定的输出logits中进行采样,以生成一个新的token。它实现了**温度调节采样**和**核采样(Top-p采样)**,具体步骤如下:

1. **Softmax转换**:将模型输出的logits通过softmax函数转换为概率分布。
2. **排序和累积概率计算**:对概率从高到低进行排序,并计算累积概率分布。
3. **核采样**:
 - 计算累积概率超过`top_p`的最小值,确定截断值`cutoff`。
 - 将所有低于截断值的概率置为0,从而保留最重要的`top_p`部分概率。
4. **温度调节**:如果`temperature`不为1,则调整概率分布,使得概率分布更平滑或更尖锐。
5. **采样**:从调整后的概率分布中采样一个值,返回对应的索引。

这种方法在文本生成任务中尤为常用,通过调节`temperature`和`top_p`参数,可以控制生成文本的多样性和质量。

v2和v3的采样方法是没有变化的。

In [14]:
def sample_logits(out, temperature=1.0, top_p=None):
    """Sample one token index from raw logits with nucleus (top-p) filtering and temperature.

    Args:
        out: 1-D sequence of logits, one entry per vocabulary item.
        temperature: the filtered probabilities are raised to ``1 / temperature``
            before sampling; 1.0 leaves them unchanged. Must be non-zero.
        top_p: cumulative-probability threshold. Tokens less likely than the
            first token at which the sorted cumulative mass exceeds ``top_p``
            are discarded. ``None`` (the default), or a threshold the
            cumulative mass never exceeds, disables the filter and the full
            distribution is sampled.

    Returns:
        A 0-d integer tensor holding the sampled index.
    """
    # Convert the logits to a probability distribution.
    probs = F.softmax(torch.tensor(out), dim=-1)

    if top_p is not None:
        # Sort from most to least likely and accumulate the probability mass.
        sorted_probs, _ = torch.sort(probs, descending=True)
        cumulative_probs = torch.cumsum(sorted_probs, dim=-1).numpy()

        exceeds = cumulative_probs > top_p
        # np.argmax of an all-False mask is 0, which would keep only the single
        # most likely token; filter only when the threshold is actually crossed.
        if exceeds.any():
            cutoff = float(sorted_probs[np.argmax(exceeds)])
            # Zero out everything strictly below the cutoff probability.
            probs[probs < cutoff] = 0

    # Temperature is applied to probabilities (not logits); torch.multinomial
    # accepts unnormalised weights, so no re-normalisation is needed.
    if temperature != 1.0:
        probs = probs.pow(1.0 / temperature)

    return torch.multinomial(probs, num_samples=1)[0]


In [15]:
# Generate NUM_TRIALS continuations of `context` (a single one in debug mode).
for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):
    # Each trial restarts from the tokenized prompt; sampled tokens are appended to `ctx`.
    ctx = model.tokenizer.encode(context)
    src_len = len(ctx)
    print(context, end='')

    model.clear()
    if TRIAL != 0:
        # Later trials restore the state cached after the prompt instead of re-running it.
        model.load(init_state)
    else:
        # First trial: feed the prompt token by token to build the recurrent state.
        init_state = types.SimpleNamespace()
        for i in range(src_len):
            x = ctx[:i + 1]
            if i < src_len - 1:
                model.run(x)
            else:
                # Keep the logits produced after the final prompt token.
                init_state.out = model.run(x)
        model.save(init_state)

    if DEBUG_DEBUG:
        out = init_state.out
        print('\n', np.array(x), '==>', np.array(out), np.max(out), np.min(out))

    for i in range(src_len, src_len + (0 if DEBUG_DEBUG else LENGTH_OF_EACH)):
        # Context up to and including position i, limited to the last ctx_len tokens.
        x = ctx[:i + 1][-model.ctx_len:]

        if i != src_len:
            out = model.run(x)  # advance the RNN by the most recent token
        else:
            # The first new token reuses the logits cached with the prompt state.
            out = copy.deepcopy(init_state.out)

        out[0] = -999999999  # disable <|endoftext|>

        char = sample_logits(out, temperature=TEMPERATURE, top_p=TOP_P).item()
        print(model.tokenizer.decode(char), end='', flush=True)

        ctx.append(char)
    print('\n' + '-' * 70, end='')


DataWhalechina is an organization founded at Shanghai Jiao Tong University that helps learners learn artificial intelligence. The technology focuses on learning from human behavior and not information.

We are an independent Data Whalechina team. This team of trained Data Whalechina team is available to answer any question and provide guidance.

The information provided on this site is not legal advice, and should not be construed as legal advice. You should consult a lawyer for advice regarding your specific situation.

We take no responsibility for the content, accuracy, or completeness of any information on this site or any information provided from third parties.

Cookies are used to store information about you so that we can remember and provide you with products and services you may have used. You can opt-out of our use of cookies at any time. To learn more, please read our cookie policy.

Cookies are tiny files stored on your computer that allow you to access information such a