karpathy Let's build GPT


1 introduction

Following karpathy's tutorial, we build a transformer step by step and, along the way, deepen our understanding of why it is designed the way it is.

karpathy recommends using a Jupyter notebook for quick experiments during network design, while building the main network in a regular Python script.


2 Network implementation

2.1 Data preparation

  • Read the text
    text = open("input.txt", "r", encoding='utf-8').read()
    words = sorted(set(''.join(text)))
    vocab_size = len(words)
    print(f'vocab_size is: {vocab_size}')
    print(''.join(words))
    print(text[:1000])
    

    vocab_size is: 65

    !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

    First Citizen:

    Before we proceed any further, hear me speak.

    All:

    Speak, speak.

    First Citizen:

    You are all resolved rather to die than to famish?

    • Map characters to integers
      stoi = {ch : i for i, ch in enumerate(words)}
      itos = {i : ch for i, ch in enumerate(words)}
      encode = lambda s: [stoi[ch] for ch in s]
      decode = lambda l: ''.join([itos[i] for i in l])
      print(encode("hii")) 
      print(decode(encode("hii")))
      

      [46, 47, 47]

      hii

      • Build the dataset
        import torch
        # build the dataset tensor
        data = torch.tensor(encode(text), dtype=torch.long)
        print(len(data))
        n = int(len(data) * 0.9)
        train_data = data[:n]
        val_data = data[n:]
        print(train_data[:1000])
        

        1115394

        tensor([18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44,

        53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63,

        1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1,

        57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49,

        • Build the data loader
          import torch
          batch_size = 4   # how many sequences per batch
          block_size = 8   # maximum context length (needed by get_batch below)
          torch.manual_seed(1337)
          def get_batch(split):
              datasets = {
                  'train': train_data,
                  'val': val_data,
              }[split]
              ix = torch.randint(0, len(datasets) - block_size, (batch_size,))
              x = torch.stack([datasets[i:i+block_size] for i in ix])
              y = torch.stack([datasets[1+i:i+block_size+1] for i in ix])
              return x, y
          xb, yb = get_batch('train')
          print(f'x shape is: {xb.shape}, y shape is: {yb.shape}')
          print(f'x is {xb}')
          print(f'y is {yb}')
          

          x shape is: torch.Size([4, 8]), y shape is: torch.Size([4, 8])

          x is tensor([[24, 43, 58, 5, 57, 1, 46, 43],

          [44, 53, 56, 1, 58, 46, 39, 58],

          [52, 58, 1, 58, 46, 39, 58, 1],

          [25, 17, 27, 10, 0, 21, 1, 54]])

          y is tensor([[43, 58, 5, 57, 1, 46, 43, 39],

          [53, 56, 1, 58, 46, 39, 58, 1],

          [58, 1, 58, 46, 39, 58, 1, 46],

          [17, 27, 10, 0, 21, 1, 54, 39]])
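
          The targets are just the inputs shifted by one position: every prefix of x predicts the next character. A small illustration in the spirit of the tutorial, using the xb, yb from above:

          # for each position t, the context is xb[0, :t+1] and the target is yb[0, t]
          for t in range(block_size):
              context = xb[0, :t+1]
              target = yb[0, t]
              print(f'when input is {context.tolist()} the target is {target}')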

          2.2 Building the pipeline

          • Define the simplest possible network
            import torch.nn as nn
            import torch.nn.functional as F
            torch.manual_seed(1337)
            class BigramLanguageModel(nn.Module):
                def __init__(self, vocab_size):
                    super().__init__()
                    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
                def forward(self, idx, targets=None):
                    self.out = self.token_embedding_table(idx)
                    return self.out
                
            xb, yb = get_batch('train')
            model = BigramLanguageModel(vocab_size)
            out = model(xb)
            print(f'x shape is: {xb.shape}')
            print(f'out shape is: {out.shape}')
            

            x shape is: torch.Size([4, 8])

            out shape is: torch.Size([4, 8, 65])

            • The complete pipeline, including the loss and generation:
              import torch.nn as nn
              import torch.nn.functional as F
              torch.manual_seed(1337)
              class BigramLanguageModel(nn.Module):
                  def __init__(self, vocab_size):
                      super().__init__()
                      self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
                  def forward(self, idx, targets=None):
                      logits = self.token_embedding_table(idx)  # B, T, C
                      if targets is None:
                          loss = None
                      else:
                          B, T, C = logits.shape
                          logits = logits.view(B*T, C) # flatten batch and time: cross_entropy expects (N, C) logits
                          targets = targets.view(B*T)  # and (N,) targets, so targets go from (B, T) to (B*T,)
                          loss = F.cross_entropy(logits, targets)
                      return logits, loss
                  def generate(self, idx, max_new_tokens):
                      for _ in range(max_new_tokens):
                          logits, loss = self(idx)    
                          logits = logits[:, -1, :]  # keep only the last time step: (B, C)
                          prob = F.softmax(logits, dim=-1)  # softmax over the vocabulary dimension
                          ix = torch.multinomial(prob, num_samples=1)  # sample one token: (B, 1)
                          print(idx)
                          idx = torch.cat((idx, ix), dim=1)   # append to the running sequence: (B, T+1)
                          print(idx)
                      return idx
                  
              xb, yb = get_batch('train')
              model = BigramLanguageModel(vocab_size)
              out, loss = model(xb)
              print(f'x shape is: {xb.shape}')
              print(f'out shape is: {out.shape}')
              idx = torch.zeros((1, 1), dtype=torch.long)
              print(decode(model.generate(idx, max_new_tokens=10)[0].tolist()))
              # print(f'idx is {idx}')
              

              x shape is: torch.Size([4, 8])

              out shape is: torch.Size([4, 8, 65])

              tensor([[0]])

              tensor([[ 0, 50]])

              tensor([[ 0, 50]])

              tensor([[ 0, 50, 7]])

              tensor([[ 0, 50, 7]])

              tensor([[ 0, 50, 7, 29]])

              tensor([[ 0, 50, 7, 29]])

              tensor([[ 0, 50, 7, 29, 37]])

              tensor([[ 0, 50, 7, 29, 37]])

              tensor([[ 0, 50, 7, 29, 37, 48]])

              tensor([[ 0, 50, 7, 29, 37, 48]])

              tensor([[ 0, 50, 7, 29, 37, 48, 58]])

              tensor([[ 0, 50, 7, 29, 37, 48, 58]])

              tensor([[ 0, 50, 7, 29, 37, 48, 58, 5]])

              tensor([[ 0, 50, 7, 29, 37, 48, 58, 5]])

              tensor([[ 0, 50, 7, 29, 37, 48, 58, 5, 15]])

              tensor([[ 0, 50, 7, 29, 37, 48, 58, 5, 15]])

              tensor([[ 0, 50, 7, 29, 37, 48, 58, 5, 15, 24]])

              tensor([[ 0, 50, 7, 29, 37, 48, 58, 5, 15, 24]])

              tensor([[ 0, 50, 7, 29, 37, 48, 58, 5, 15, 24, 12]])

              l-QYjt’CL?

              A few things to note here. First, the inputs and targets are:

              x is tensor([[24, 43, 58, 5, 57, 1, 46, 43],

              [44, 53, 56, 1, 58, 46, 39, 58],

              [52, 58, 1, 58, 46, 39, 58, 1],

              [25, 17, 27, 10, 0, 21, 1, 54]])

              y is tensor([[43, 58, 5, 57, 1, 46, 43, 39],

              [53, 56, 1, 58, 46, 39, 58, 1],

              [58, 1, 58, 46, 39, 58, 1, 46],

              [17, 27, 10, 0, 21, 1, 54, 39]])

              Also note that in this pipeline the network places no restriction on the input length: the bigram model looks up each token independently, so sequences of any length work (see the quick check below).
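
              A quick check (a sketch, assuming the model and vocab_size defined above are in scope):

              long_idx = torch.randint(0, vocab_size, (1, 100))  # a sequence much longer than block_size
              logits, loss = model(long_idx)                     # no targets, so loss is None
              print(logits.shape)                                # torch.Size([1, 100, 65])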

              • Start training

                At this point we need a complete training script. If we stay in the Jupyter notebook, every change to one component of the network forces us to re-run many cells, which is tedious, so we put everything into a .py file.

                import torch
                import torch.nn as nn
                import torch.nn.functional as F
                # hyperparameters
                batch_size = 32
                block_size = 8
                max_iter = 3000
                eval_interval = 300
                learning_rate = 1e-2
                device = 'cuda' if torch.cuda.is_available() else 'cpu'
                eval_iters = 200
                # ---------------------
                torch.manual_seed(1337)
                text = open("input.txt", "r", encoding='utf-8').read()
                chars = sorted(list(set(text)))
                vocab_size = len(chars)
                stoi = {ch : i for i, ch in enumerate(chars)}
                itos = {i : ch for i, ch in enumerate(chars)}
                encode = lambda s: [stoi[ch] for ch in s]
                decode = lambda l: ''.join([itos[i] for i in l])
                # build the dataset tensor
                data = torch.tensor(encode(text), dtype=torch.long)
                n = int(len(data) * 0.9)
                train_data = data[:n]
                val_data = data[n:]
                def get_batch(split):
                    datasets = {
                        'train': train_data,
                        'val': val_data,
                    }[split]
                    ix = torch.randint(0, len(datasets) - block_size, (batch_size,))
                    x = torch.stack([datasets[i:i+block_size] for i in ix])
                    y = torch.stack([datasets[1+i:i+block_size+1] for i in ix])
                    x, y = x.to(device), y.to(device)
                    return x, y
                @torch.no_grad()
                def estimate_loss():
                    out = {}
                    model.eval()
                    for split in ['train', 'val']:
                        losses = torch.zeros(eval_iters)
                        for k in range(eval_iters):
                            X, Y = get_batch(split)
                            logits, loss = model(X, Y)
                            losses[k] = loss.item()
                        out[split] = losses.mean()
                    model.train()
                    return out
                class BigramLanguageModel(nn.Module):
                    def __init__(self, vocab_size):
                        super().__init__()
                        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
                    def forward(self, idx, targets=None):
                        # import pdb; pdb.set_trace()
                        logits = self.token_embedding_table(idx)  # B, T, C
                        if targets is None:
                            loss = None
                        else:
                            B, T, C = logits.shape
                            logits = logits.view(B*T, C) # flatten to (B*T, C): cross_entropy expects (N, C) logits
                            targets = targets.view(B*T)  # and (N,) targets, so (B, T) becomes (B*T,)
                            loss = F.cross_entropy(logits, targets)
                        return logits, loss
                    def generate(self, idx, max_new_tokens):
                        for _ in range(max_new_tokens):
                            logits, loss = self(idx)    
                            logits = logits[:, -1, :]  # B, C    
                            prob = F.softmax(logits, dim=-1)  # softmax over the vocabulary dimension
                            ix = torch.multinomial(prob, num_samples=1) # B, 1
                            # print(idx)
                            idx = torch.cat((idx, ix), dim=1)   # B,T+1
                            # print(idx)
                        return idx
                model = BigramLanguageModel(vocab_size)
                m = model.to(device)
                optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
                lossi = []
                for iter in range(max_iter):
                    if iter % eval_interval == 0:
                        losses = estimate_loss()
                        print(f'step {iter}: train loss {losses["train"]:.4f}, val loss {losses["val"]:.4f}')
                    xb, yb = get_batch('train')
                    out, loss = m(xb, yb)
                    optimizer.zero_grad(set_to_none=True)
                    loss.backward()
                    optimizer.step()
                # generate from the model
                context = torch.zeros((1,1), dtype=torch.long, device=device)
                print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))
                

                Output:

                step 0: train loss 4.7305, val loss 4.7241

                step 300: train loss 2.8110, val loss 2.8249

                step 600: train loss 2.5434, val loss 2.5682

                step 900: train loss 2.4932, val loss 2.5088

                step 1200: train loss 2.4863, val loss 2.5035

                step 1500: train loss 2.4665, val loss 2.4921

                step 1800: train loss 2.4683, val loss 2.4936

                step 2100: train loss 2.4696, val loss 2.4846

                step 2400: train loss 2.4638, val loss 2.4879

                step 2700: train loss 2.4738, val loss 2.4911

                CEThik brid owindakis b, bth

                HAPet bobe d e.

                S:

                O:3 my d?

                LUCous:

                Wanthar u qur, t.

                War dXENDoate awice my.

                Hastarom oroup

                Yowhthetof isth ble mil ndill, ath iree sengmin lat Heriliovets, and Win nghir.

                Swanousel lind me l.

                HAshe ce hiry:

                Supr aisspllw y.

                Hentofu n Boopetelaves

                MPOLI s, d mothakleo Windo whth eisbyo the m dourive we higend t so mower; te

                AN ad nterupt f s ar igr t m:

                Thin maleronth,

                Mad

                RD:

                WISo myrangoube!

                KENob&y, wardsal thes ghesthinin couk ay aney IOUSts I&fr y ce.

                J

                2.3 self-attention

                When processing the current character we need to communicate with the characters before it. That history can be treated as a feature, and the simplest way to extract it is to average the features of all previous characters.

                # the simplest form of communication: average the current character with all previous ones
                # the average can be seen as a feature of the history
                a = torch.tril(torch.ones(3, 3))
                print(a)
                a = a / torch.sum(a, 1, keepdim=True)  # normalize each row so the weights sum to 1
                print(a)
                

                tensor([[1., 0., 0.],

                [1., 1., 0.],

                [1., 1., 1.]])

                tensor([[1.0000, 0.0000, 0.0000],

                [0.5000, 0.5000, 0.0000],

                [0.3333, 0.3333, 0.3333]])
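
                A sanity check (a sketch, assuming a toy x of shape (B, T, C)): the weighted matrix multiply is equivalent to explicitly averaging every token with its past.

                B, T, C = 4, 8, 2
                x = torch.randn(B, T, C)
                xbow = torch.zeros((B, T, C))
                for b in range(B):
                    for t in range(T):
                        xbow[b, t] = x[b, :t+1].mean(dim=0)   # average of tokens 0..t
                wei = torch.tril(torch.ones(T, T))
                wei = wei / wei.sum(1, keepdim=True)
                xbow2 = wei @ x                               # (T, T) @ (B, T, C) -> (B, T, C)
                print(torch.allclose(xbow, xbow2, atol=1e-6)) # True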

                We can produce the same averaging weights with a softmax over a masked matrix:

                import torch.nn.functional as F
                tril = torch.tril(torch.ones(T, T))       # lower-triangular causal mask
                wei = torch.zeros(T, T)                   # the (still uniform) attention affinities
                wei = wei.masked_fill(tril == 0, float('-inf'))  # future positions get -inf
                print(wei)
                wei = F.softmax(wei, dim=-1)              # softmax turns each row into averaging weights
                print(wei)
                

                tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],

                [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],

                [0., 0., 0., -inf, -inf, -inf, -inf, -inf],

                [0., 0., 0., 0., -inf, -inf, -inf, -inf],

                [0., 0., 0., 0., 0., -inf, -inf, -inf],

                [0., 0., 0., 0., 0., 0., -inf, -inf],

                [0., 0., 0., 0., 0., 0., 0., -inf],

                [0., 0., 0., 0., 0., 0., 0., 0.]])

                tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],

                [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],

                [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],

                [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],

                [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],

                [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],

                [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],

                [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

                The result of the aggregation:

                xbow2 = wei @ x # (T, T) @ (B, T, C) --> (B, T, C); x plays the role of the values v
                print(xbow2.shape)
                

                torch.Size([4, 8, 2])

                The current version of forward with pos_emb added:

                def forward(self, idx, targets=None):
                        # import pdb; pdb.set_trace()
                        B, T = idx.shape
                        tok_emb = self.token_embedding_table(idx)  # (B, T, C) with C = n_embd
                        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
                        # positional encoding
                        x = tok_emb + pos_emb   # (B, T, C) via broadcasting
                        logits = self.lm_head(x)  # (B, T, vocab_size)
                        if targets is None:
                            loss = None
                        else:
                            B, T, C = logits.shape
                            logits = logits.view(B*T, C) # flatten to (B*T, C) for cross_entropy
                            targets = targets.view(B*T)  # targets go from (B, T) to (B*T,)
                            loss = F.cross_entropy(logits, targets)
                        return logits, loss
                

                Some takeaways from karpathy:

                • Attention is a communication mechanism. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
                • There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
                • Each example across batch dimension is of course processed completely independently and never “talk” to each other
                • In an “encoder” attention block just delete the single line that does masking with tril, allowing all tokens to communicate. This block here is called a “decoder” attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
                • “self-attention” just means that the keys and values are produced from the same source as queries. In “cross-attention”, the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
                • “Scaled” attention additionally divides wei by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

                  In the attention formula, the scaling keeps the variance stable when the two distributions are multiplied together:

                  Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V

                  B, T, head_size = 4, 8, 16   # toy sizes; head_size = 16 matches the variance printed below
                  k = torch.randn(B, T, head_size)
                  q = torch.randn(B, T, head_size)
                  wei = q @ k.transpose(-2, -1)
                  wei_scale = wei / head_size**0.5
                  print(k.var())
                  print(q.var())
                  print(wei.var())
                  print(wei_scale.var())
                  

                  Output:

                  tensor(1.0278)

                  tensor(0.9802)

                  tensor(15.9041)

                  tensor(0.9940)

                  Initialization matters a lot here: we want the softmax input to have a small variance at initialization. Without the scaling, softmax saturates towards a one-hot distribution:

                  torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]) * 8, dim=-1)
                  

                  tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])
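
                  For comparison, without the sharpening factor the same logits give a much more diffuse distribution (values approximate):

                  torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)
                  # tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])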

                  Now make some changes to the original .py file:


                  class Head(nn.Module):
                      def __init__(self, head_size):
                          super().__init__()
                          self.query = nn.Linear(n_embd, head_size, bias=False)
                          self.key = nn.Linear(n_embd, head_size, bias=False)
                          self.value = nn.Linear(n_embd, head_size, bias=False)
                          self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
                      def forward(self, x):
                          # import pdb; pdb.set_trace()
                          B, T, C = x.shape    
                          q = self.query(x)      #(B, T, C)
                          k = self.key(x)        #(B, T, C)
                          v = self.value(x)      #(B, T, C)
                          wei = q @ k.transpose(-2, -1) * C**-0.5 # (B,T,C)@(B,C,T) --> (B, T, T)
                          wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) 
                          wei = F.softmax(wei, dim=-1)   # (B, T, T)
                          out = wei @ v   #(B, T, T) @ (B, T, C) --> (B, T, C)
                          return out
                  

                  Modify the model:

                  class BigramLanguageModel(nn.Module):
                      def __init__(self, vocab_size):
                          super().__init__()
                          self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
                          self.position_embedding_table = nn.Embedding(block_size, n_embd)
                          self.sa_head = Head(n_embd)  # a single head with the same size as the embedding
                          self.lm_head = nn.Linear(n_embd, vocab_size)
                      def forward(self, idx, targets=None):
                          # import pdb; pdb.set_trace()
                          B, T = idx.shape
                          tok_emb = self.token_embedding_table(idx)  # B, T, C(n_emb)
                          pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # T,C 
                          # positional encoding
                          x = tok_emb + pos_emb   # (B, T, C) broadcasting
                          x = self.sa_head(x)
                          logits = self.lm_head(x)  # B, T, C(vocab_size)
                          if targets is None:
                              loss = None
                          else:
                              B, T, C = logits.shape
                              logits = logits.view(B*T, C) # flatten to (B*T, C) for cross_entropy
                              targets = targets.view(B*T)  # targets go from (B, T) to (B*T,)
                              loss = F.cross_entropy(logits, targets)
                          return logits, loss
                      def generate(self, idx, max_new_tokens):
                          for _ in range(max_new_tokens):
                              idx_cond = idx[:, -block_size:]  # crop to the last block_size tokens: position embeddings only cover block_size positions
                              logits, loss = self(idx_cond)
                              logits = logits[:, -1, :]  # keep only the last time step: (B, C)
                              prob = F.softmax(logits, dim=-1)  # softmax over the vocabulary dimension
                              ix = torch.multinomial(prob, num_samples=1) # B, 1
                              # print(idx)
                              idx = torch.cat((idx, ix), dim=1)   # B,T+1
                              # print(idx)
                          return idx
                  

                  Result after adding self-attention:

                  step 4500: train loss 2.3976, val loss 2.4041

                  2.4 multi-head attention


                  This borrows the idea behind grouped convolutions: run several smaller attention heads in parallel and concatenate their outputs.

                  class MultiHeadAttention(nn.Module):
                      """ multiple head of self attention in parallel """
                      def __init__(self, num_heads, head_size):
                          super().__init__()
                          self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
                      def forward(self, x):
                          return torch.cat([h(x) for h in self.heads], dim=-1)
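
                  A quick shape check (a sketch, assuming the globals n_embd = 32 and block_size >= 8 from the script): four heads of size n_embd // 4 concatenate back to n_embd, much like grouped convolutions.

                  B, T = 4, 8
                  sa = MultiHeadAttention(4, n_embd // 4)
                  x = torch.randn(B, T, n_embd)
                  print(sa(x).shape)   # torch.Size([4, 8, 32])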
                  

                  When using it in the model:

                  self.sa_head = MultiHeadAttention(4, n_embd//4)  # four heads whose concatenated size is still n_embd
                  

                  Training result:

                  step 4500: train loss 2.2679, val loss 2.2789

                  2.5 feedforward network

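                  The feedforward module itself appeared only as screenshots in the original post. A minimal sketch consistent with the surrounding code (assuming the simple single-hidden-layer version used at this stage, before the 4x expansion and dropout added later):

                  class FeedForward(nn.Module):
                      """ a simple per-token MLP: computation after the communication step """
                      def __init__(self, n_embd):
                          super().__init__()
                          self.net = nn.Sequential(
                              nn.Linear(n_embd, n_embd),
                              nn.ReLU(),
                          )
                      def forward(self, x):
                          return self.net(x)

                  In the model it is applied right after the self-attention head, e.g. x = self.ffwd(self.sa_head(x)).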

                  Result after adding the feedforward network:

                  step 4500: train loss 2.2337, val loss 2.2476

                  We also wrap this unit into a Block.

                  A transformer block can be understood as a communication part (multi-head self-attention) plus a computation part (the feedforward network):

                  class Block(nn.Module):
                      def __init__(self, n_embd, n_head):
                          super().__init__()
                          head_size = n_embd // n_head
                          self.sa = MultiHeadAttention(n_head, head_size)
                          self.ffwd = FeedForward(n_embd)
                      
                      def forward(self, x):
                          x = self.sa(x)
                          x = self.ffwd(x)
                          return x
                  

                  Modify the model definition:

                  class BigramLanguageModel(nn.Module):
                      def __init__(self, vocab_size):
                          super().__init__()
                          self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
                          self.position_embedding_table = nn.Embedding(block_size, n_embd)
                          self.blocks = nn.Sequential(
                              Block(n_embd, n_head=4),
                              Block(n_embd, n_head=4),
                              Block(n_embd, n_head=4),
                          )
                          self.lm_head = nn.Linear(n_embd, vocab_size)
                      def forward(self, idx, targets=None):
                          # import pdb; pdb.set_trace()
                          B, T = idx.shape
                          tok_emb = self.token_embedding_table(idx)  # B, T, C(n_emb)
                          pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # T,C 
                          # positional encoding
                          x = tok_emb + pos_emb   # (B, T, C) broadcasting
                          x = self.blocks(x)
                          logits = self.lm_head(x)  # B, T, C(vocab_size)
                          if targets is None:
                              loss = None
                          else:
                              B, T, C = logits.shape
                              logits = logits.view(B*T, C) # flatten to (B*T, C) for cross_entropy
                              targets = targets.view(B*T)  # targets go from (B, T) to (B*T,)
                              loss = F.cross_entropy(logits, targets)
                          return logits, loss
                  

                  2.6 Residual network

                  The model is now fairly deep, and training it directly may not converge well. We need another important tool: residual connections.

                  class Block(nn.Module):
                      def __init__(self, n_embd, n_head):
                          super().__init__()
                          head_size = n_embd // n_head
                          self.sa = MultiHeadAttention(n_head, head_size)
                          self.ffwd = FeedForward(n_embd)
                      
                      def forward(self, x):
                          x = x + self.sa(x)
                          x = x + self.ffwd(x)
                          return x
                  

                  With the extra depth the model now overfits quite easily:

                  step 4500: train loss 2.0031, val loss 2.1067

                  2.7 Layer normalization

                  Let's first look at plain batch norm. Suppose x and y are two independent distributions, each with mean 0 and variance 1.

                  Then Var(x·y) = E(x)^2·Var(y) + E(y)^2·Var(x) + Var(x)·Var(y) = 1.
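
                  A quick numeric check of that identity (a sketch):

                  x = torch.randn(10**6)
                  y = torch.randn(10**6)
                  print((x * y).var())   # ~1.0, as predicted for independent zero-mean, unit-variance x and y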

                  Now look at a matrix product: each output entry is the sum of T2 such independent, identically distributed products. By the central limit theorem their sum is approximately normal, with mean equal to the sum of the means and variance equal to the sum of the variances.

                  That is, after the matrix multiplication each entry of the first column has var = T2 * 1 and mean = 0.

                  So dividing the matrix product by sqrt(T2) normalizes the variance back to 1 (variance grows with the square of the scale, hence the square root).


                  x = torch.ones(5,5)
                  x = torch.tril(x)
                  print(x)
                  print(x.mean(dim=0))
                  print(x.mean(dim=1))
                  

                  Observe how the mean differs depending on which dimension of the matrix you reduce over:

                  tensor([[1., 0., 0., 0., 0.],

                  [1., 1., 0., 0., 0.],

                  [1., 1., 1., 0., 0.],

                  [1., 1., 1., 1., 0.],

                  [1., 1., 1., 1., 1.]])

                  tensor([1.0000, 0.8000, 0.6000, 0.4000, 0.2000])

                  tensor([0.2000, 0.4000, 0.6000, 0.8000, 1.0000])

                  class BatchNorm1d:
                    
                    def __init__(self, dim, eps=1e-5, momentum=0.1):
                      self.eps = eps
                      self.momentum = momentum
                      self.training = True
                      # parameters (trained with backprop)
                      self.gamma = torch.ones(dim)
                      self.beta = torch.zeros(dim)
                      # buffers (trained with a running 'momentum update')
                      self.running_mean = torch.zeros(dim)
                      self.running_var = torch.ones(dim)
                    
                    def __call__(self, x):
                      # calculate the forward pass
                      if self.training:
                        if x.ndim == 2:
                          dim = 0
                        elif x.ndim == 3:
                          dim = (0,1)
                        xmean = x.mean(dim, keepdim=True) # batch mean
                        xvar = x.var(dim, keepdim=True) # batch variance
                      else:
                        xmean = self.running_mean
                        xvar = self.running_var
                      xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
                      self.out = self.gamma * xhat + self.beta
                      # update the buffers
                      if self.training:
                        with torch.no_grad():
                          self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                          self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
                      return self.out
                    
                    def parameters(self):
                      return [self.gamma, self.beta]
                  

                  Batch norm normalizes each column (across the batch dimension); layer norm normalizes each row (across the feature dimension).

                  class LayerNorm1d:  # the same code with dim=1: normalize each row (per example) instead of each column
                    
                    def __init__(self, dim, eps=1e-5, momentum=0.1):
                      self.eps = eps
                      self.momentum = momentum
                      self.training = True
                      # parameters (trained with backprop)
                      self.gamma = torch.ones(dim)
                      self.beta = torch.zeros(dim)
                      # buffers (trained with a running 'momentum update')
                      self.running_mean = torch.zeros(dim)
                      self.running_var = torch.ones(dim)
                    
                    def __call__(self, x):
                      # calculate the forward pass
                      if self.training:
                        dim = 1
                        xmean = x.mean(dim, keepdim=True) # per-example mean over the features
                        xvar = x.var(dim, keepdim=True) # per-example variance over the features
                      else:
                        xmean = self.running_mean
                        xvar = self.running_var
                      xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
                      self.out = self.gamma * xhat + self.beta
                      # update the buffers
                      if self.training:
                        with torch.no_grad():
                          self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                          self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
                      return self.out
                    
                    def parameters(self):
                      return [self.gamma, self.beta]
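
                  A quick usage check (a sketch, mirroring the lecture): rows come out normalized, columns do not.

                  torch.manual_seed(1337)
                  module = LayerNorm1d(100)
                  x = torch.randn(32, 100)              # batch of 32 examples with 100 features each
                  x = module(x)
                  print(x[0, :].mean(), x[0, :].std())  # ~0 and ~1: each row (example) is normalized
                  print(x[:, 0].mean(), x[:, 0].std())  # only roughly 0 and 1: the batch (column) dimension is not explicitly normalized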
                  

                  The variant more commonly used today applies the layer norms before the sub-layers (pre-norm):


                  The corresponding code:

                  class Block(nn.Module):
                      def __init__(self, n_embd, n_head):
                          super().__init__()
                          head_size = n_embd // n_head
                          self.sa = MultiHeadAttention(n_head, head_size)
                          self.ffwd = FeedForward(n_embd)
                          self.ln1 = nn.LayerNorm(n_embd)
                          self.ln2 = nn.LayerNorm(n_embd)

                      def forward(self, x):
                          x = x + self.sa(self.ln1(x))
                          x = x + self.ffwd(self.ln2(x))
                          return x
                  
                  And usually a final LayerNorm is added after the stack of decoder blocks:

                  class BigramLanguageModel(nn.Module):
                      def __init__(self, vocab_size):
                          super().__init__()
                          self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
                          self.position_embedding_table = nn.Embedding(block_size, n_embd)
                          self.blocks = nn.Sequential(
                              Block(n_embd, n_head=4),
                              Block(n_embd, n_head=4),
                              Block(n_embd, n_head=4),
                              nn.LayerNorm(n_embd),
                          )
                          self.lm_head = nn.Linear(n_embd, vocab_size)
                  

                  After adding layer normalization, the loss improves a bit more:

                  step 4500: train loss 1.9931, val loss 2.0892

                  The gap between the training loss and the validation loss is now fairly large, so we need to do something about it.

                  2.8 Using dropout

                  • Use dropout inside the attention head so the model is not overly influenced by any particular feature, which improves robustness (the constructor additions this assumes are sketched after this list).
                        def forward(self, x):
                            # import pdb; pdb.set_trace()
                            B, T, C = x.shape    
                            q = self.query(x)      #(B, T, C)
                            k = self.key(x)        #(B, T, C)
                            v = self.value(x)      #(B, T, C)
                            wei = q @ k.transpose(-2, -1) * C**-0.5 # (B,T,C)@(B,C,T) --> (B, T, T)
                            wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) 
                            wei = F.softmax(wei, dim=-1)   # (B, T, T)
                            wei = self.dropout(wei)
                            out = wei @ v   #(B, T, T) @ (B, T, C) --> (B, T, C)
                            return out
                    
                    • Use dropout on the multi-head output for the same reason: keep any single feature from dominating the model (see the same sketch below for self.proj and self.dropout).
                          def forward(self, x):
                              out =  torch.cat([h(x) for h in self.heads], dim=-1)
                              out = self.dropout(self.proj(out))
                              return out
                      
                      • Use dropout just before the output of the computation unit:
                        class FeedForward(nn.Module):
                            def __init__(self, n_embd):
                                super().__init__()
                                self.net = nn.Sequential(
                                    nn.Linear(n_embd, 4 * n_embd),
                                    nn.ReLU(),
                                    nn.Linear(4 * n_embd, n_embd),
                                    nn.Dropout(dropout),
                                )
                            def forward(self, x):
                                return self.net(x)
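
                        The snippets above assume a few constructor additions that the post does not show explicitly; a sketch of what they would look like:

                        # assumed addition in Head.__init__:
                        self.dropout = nn.Dropout(dropout)
                        # assumed additions in MultiHeadAttention.__init__:
                        self.proj = nn.Linear(n_embd, n_embd)   # project the concatenated heads back to n_embd
                        self.dropout = nn.Dropout(dropout)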
                        

                        Update the hyperparameters:

                        # hyperparameters
                        batch_size = 64
                        block_size = 256
                        max_iter = 5000
                        eval_interval = 500
                        learning_rate = 3e-4    # self-attention can't tolerate a very high learning rate
                        device = 'cuda' if torch.cuda.is_available() else 'cpu'
                        eval_iters = 200
                        n_embd = 384
                        n_layer = 6
                        n_head = 6
                        dropout = 0.2
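
                        With n_layer and n_head as hyperparameters, the hard-coded stack of three blocks in the model would typically be built like this (a sketch of the assumed change inside __init__):

                        self.blocks = nn.Sequential(
                            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)],
                            nn.LayerNorm(n_embd),
                        )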
                        

                        step 4500: train loss 1.1112, val loss 1.4791

                        References

                        [1] https://www.youtube.com/watch?v=kCc8FmEb1nY
