nanoGPT
model.py
- Lines 10-16 of model.py: the imported packages.

```python
import math
import inspect
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F
```
class LayerNorm(nn.Module)
- A thin wrapper around PyTorch's F.layer_norm that lets you choose whether the layer carries a bias term.

```python
class LayerNorm(nn.Module):
    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """

    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
```
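A quick sanity check (my own sketch, not part of model.py): with `bias=False` the module registers no bias parameter, yet the output shape and the normalization itself are unchanged.

```python
# Sketch, not from model.py: verify LayerNorm behaves as expected with bias=False.
ln = LayerNorm(8, bias=False)
x = torch.randn(2, 4, 8)                            # (batch, time, n_embd)
out = ln(x)
print(out.shape)                                    # torch.Size([2, 4, 8])
print([name for name, _ in ln.named_parameters()])  # ['weight'] -- no bias parameter
print(out.mean(dim=-1).abs().max())                 # close to 0: each vector is normalized
```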
class CausalSelfAttention(nn.Module)
- The causal self-attention mechanism: each position may only attend to itself and earlier positions, enforced either by `is_causal=True` in the Flash Attention path or by a lower-triangular mask in the manual path.

```python
class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        # the embedding dimension must be evenly divisible by the number of heads
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                        .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # split along dim=2 so q, k, v each keep n_embd as their last dimension
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y
```
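A minimal usage sketch (mine, not from model.py). In nanoGPT the module is normally built from the GPTConfig dataclass defined later in the file; here a hypothetical stand-in carrying only the fields CausalSelfAttention actually reads is enough to run a forward pass and confirm that the output shape matches the input.

```python
# Sketch, not from model.py: TinyConfig is a hypothetical stand-in for the real config,
# carrying only the fields CausalSelfAttention reads in __init__.
@dataclass
class TinyConfig:
    block_size: int = 16   # maximum sequence length the causal mask covers
    n_head: int = 4        # number of attention heads
    n_embd: int = 32       # embedding dimension (must be divisible by n_head)
    dropout: float = 0.0
    bias: bool = True

attn = CausalSelfAttention(TinyConfig())
x = torch.randn(2, 16, 32)   # (B, T, C) with T <= block_size
y = attn(x)
print(y.shape)               # torch.Size([2, 16, 32]) -- attention preserves the shape
```

Note that the causal mask buffer is only registered in the fallback branch: when Flash Attention is available, `scaled_dot_product_attention(..., is_causal=True)` applies the causal masking internally, so the explicit lower-triangular buffer is never needed.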