fork download
  1. import os, math, random, time
  2.  
  3. random.seed(42)
  4.  
  5. # Data
  6. if not os.path.exists("input.txt"):
  7. import urllib.request
  8. urllib.request.urlretrieve("https://r...content-available-to-author-only...t.com/karpathy/makemore/refs/heads/master/names.txt", "input.txt")
  9. docs = [l.strip() for l in open("input.txt").read().strip().split("\n") if l.strip()]
  10. random.shuffle(docs)
  11. uchars = sorted(set("".join(docs)))
  12. BOS = len(uchars)
  13. vocab_size = len(uchars) + 1
  14.  
  15. # Vector helpers
  16. vadd = lambda a, b: [ai + bi for ai, bi in zip(a, b)]
  17. vdot = lambda a, b: sum(ai * bi for ai, bi in zip(a, b))
  18. vscale = lambda a, s: [ai * s for ai in a]
  19. vaccum = lambda dst, src: [dst.__setitem__(j, dst[j] + src[j]) for j in range(len(src))]
  20.  
# Config & params
# Tiny GPT: 16-dim embeddings, 4 heads, 1 layer, context of 8 tokens.
n_embd, n_head, n_layer, block_size = 16, 4, 1, 8
head_dim = n_embd // n_head
# mat(r, c, s): r x c matrix of Gaussian(0, s) weights.
mat = lambda r, c, s=0.02: [[random.gauss(0, s) for _ in range(c)] for _ in range(r)]
# dlike(d): dict of zero matrices with the same shapes as d (for grads/moments).
dlike = lambda d: {k: [[0.0]*len(r) for r in m] for k, m in d.items()}
state_dict = {"wte": mat(vocab_size, n_embd), "wpe": mat(block_size, n_embd), "lm_head": mat(vocab_size, n_embd)}
for i in range(n_layer):
    # Output projections (attn_wo, mlp_fc2) are zero-initialized (s=0), so each
    # residual branch contributes nothing at step 0.
    state_dict |= {f"layer{i}.attn_wq": mat(n_embd, n_embd), f"layer{i}.attn_wk": mat(n_embd, n_embd),
                   f"layer{i}.attn_wv": mat(n_embd, n_embd), f"layer{i}.attn_wo": mat(n_embd, n_embd, 0),
                   f"layer{i}.mlp_fc1": mat(4 * n_embd, n_embd), f"layer{i}.mlp_fc2": mat(n_embd, 4 * n_embd, 0)}

  32. # Ops
  33. def linear(x, w): return [vdot(wr, x) for wr in w]
  34.  
  35. def softmax(z):
  36. mx = max(z); e = [math.exp(v - mx) for v in z]; s = sum(e)
  37. return vscale(e, 1 / s)
  38.  
  39. def rmsnorm(x):
  40. inv = (sum(v*v for v in x) / len(x) + 1e-5) ** -0.5
  41. return vscale(x, inv), inv
  42.  
  43. def rmsnorm_b(dy, y, inv):
  44. s = sum(dy[j] * y[j] for j in range(len(y)))
  45. return [inv * (dy[j] - y[j] * s / len(y)) for j in range(len(y))]
  46.  
  47. def linear_b(dy, w, x, dw):
  48. for j in range(len(dy)):
  49. for k in range(len(x)):
  50. dw[j][k] += dy[j] * x[k]
  51. return [sum(dy[j] * w[j][k] for j in range(len(dy))) for k in range(len(x))]
  52.  
  53. def softmax_b(dp, p):
  54. s = sum(di * pi for di, pi in zip(dp, p))
  55. return [p[i] * (dp[i] - s) for i in range(len(p))]
  56.  
# Forward pass (single position, shared by training and inference)
def forward_pos(tok, pos, keys, values, save=False):
    """Run the transformer for token `tok` at position `pos`, appending this
    position's per-layer K/V rows to the caller-owned `keys`/`values` caches
    (attention is causal because the caches only ever hold positions <= pos).
    Returns the logits; with save=True also returns the activations the
    backward pass needs."""
    # Token + positional embedding, then RMS norm (inv scale kept for backprop).
    x0, inv0 = rmsnorm(vadd(state_dict['wte'][tok], state_dict['wpe'][pos]))
    x, layers = x0, []
    for li in range(n_layer):
        xn, ainv = rmsnorm(x)  # pre-attention norm
        q = linear(xn, state_dict[f'layer{li}.attn_wq'])
        keys[li].append(linear(xn, state_dict[f'layer{li}.attn_wk']))
        values[li].append(linear(xn, state_dict[f'layer{li}.attn_wv']))
        x_attn, attn_weights, n_ctx = [0.0] * n_embd, [], len(keys[li])
        # Multi-head attention: each head works on its own head_dim slice.
        for h in range(n_head):
            hs = h * head_dim
            q_h = q[hs:hs + head_dim]
            k_h = [keys[li][t][hs:hs + head_dim] for t in range(n_ctx)]
            v_h = [values[li][t][hs:hs + head_dim] for t in range(n_ctx)]
            # Scaled dot-product scores over the whole cached context.
            attn_logits = [sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim**0.5 for t in range(n_ctx)]
            attn_w = softmax(attn_logits)
            attn_weights.append(attn_w)
            for j in range(head_dim):
                x_attn[hs + j] = sum(attn_w[t] * v_h[t][j] for t in range(n_ctx))
        x = vadd(linear(x_attn, state_dict[f'layer{li}.attn_wo']), x)  # residual add
        x_norm, norm_inv = rmsnorm(x)  # pre-MLP norm
        mlp_h1 = linear(x_norm, state_dict[f'layer{li}.mlp_fc1'])
        mlp_h1_act = [max(0.0, v) ** 2 for v in mlp_h1]  # squared-ReLU activation
        x = vadd(linear(mlp_h1_act, state_dict[f'layer{li}.mlp_fc2']), x)  # residual add
        if save:
            layers.append((xn, ainv, q, x_attn, attn_weights, x_norm, norm_inv, mlp_h1, mlp_h1_act))
    return (linear(x, state_dict['lm_head']), (tok, x0, inv0, layers, x)) if save else linear(x, state_dict['lm_head'])

# Train (Adam)
learning_rate, beta1, beta2, eps_adam = 1e-2, 0.9, 0.95, 1e-8
num_steps = 5000
m_state, v_state = dlike(state_dict), dlike(state_dict)  # Adam first/second moments

t0 = time.time()
for step in range(num_steps):
    # One document per step, wrapped in BOS markers; truncate to the context size.
    doc = docs[step % len(docs)]
    tokens = [BOS] + [uchars.index(c) for c in doc] + [BOS]
    n = min(block_size, len(tokens) - 1)

    # Forward: mean cross-entropy of next-token prediction over n positions.
    keys, values, saved = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)], []
    loss = 0.0
    for pos in range(n):
        logits, state = forward_pos(tokens[pos], pos, keys, values, save=True)
        probs = softmax(logits)
        loss -= math.log(probs[tokens[pos + 1]]) / n
        saved.append((*state, probs))

    # Backward: manual backprop, walking positions in reverse so that by the
    # time position `pos` backprops through wk/wv, dkeys/dvalues[li][pos] has
    # collected the contributions from every later query that attended to it.
    dstate = dlike(state_dict)
    dkeys = [[[0.0] * n_embd for _ in range(n)] for _ in range(n_layer)]
    dvalues = [[[0.0] * n_embd for _ in range(n)] for _ in range(n_layer)]
    for pos in range(n - 1, -1, -1):
        tok, x0, inv0, layers, xf, probs = saved[pos]
        target_id = tokens[pos + 1]
        # d(loss)/d(logits) for softmax+cross-entropy is (probs - onehot)/n.
        dx = linear_b([(probs[j] - (j == target_id)) / n for j in range(vocab_size)], state_dict['lm_head'], xf, dstate['lm_head'])
        for li in range(n_layer - 1, -1, -1):
            xn, ainv, q, x_attn, attn_weights, x_norm, norm_inv, mlp_h1, mlp_h1_act = layers[li]
            # MLP branch: fc2 -> squared-ReLU (grad 2*h where h > 0) -> fc1 -> norm.
            dmlp_h1_act = linear_b(dx, state_dict[f'layer{li}.mlp_fc2'], mlp_h1_act, dstate[f'layer{li}.mlp_fc2'])
            dxn = linear_b([dmlp_h1_act[j] * 2 * mlp_h1[j] if mlp_h1[j] > 0 else 0 for j in range(4 * n_embd)],
                           state_dict[f'layer{li}.mlp_fc1'], x_norm, dstate[f'layer{li}.mlp_fc1'])
            dx = vadd(rmsnorm_b(dxn, x_norm, norm_inv), dx)  # residual: grads add
            # Attention branch.
            dx_attn = linear_b(dx, state_dict[f'layer{li}.attn_wo'], x_attn, dstate[f'layer{li}.attn_wo'])
            dq = [0.0] * n_embd
            for h in range(n_head):
                hs = h * head_dim
                q_h, attn_w = q[hs:hs + head_dim], attn_weights[h]
                k_h = [keys[li][t][hs:hs + head_dim] for t in range(pos + 1)]
                v_h = [values[li][t][hs:hs + head_dim] for t in range(pos + 1)]
                dattn = softmax_b([sum(dx_attn[hs + j] * v_h[t][j] for j in range(head_dim)) for t in range(pos + 1)], attn_w)
                for t in range(pos + 1):
                    c = dattn[t] / head_dim**0.5  # undo the score scaling
                    for j in range(head_dim):
                        dvalues[li][t][hs + j] += dx_attn[hs + j] * attn_w[t]
                        dq[hs + j] += c * k_h[t][j]
                        dkeys[li][t][hs + j] += c * q_h[j]
            dxn = linear_b(dq, state_dict[f'layer{li}.attn_wq'], xn, dstate[f'layer{li}.attn_wq'])
            dxn = vadd(dxn, linear_b(dkeys[li][pos], state_dict[f'layer{li}.attn_wk'], xn, dstate[f'layer{li}.attn_wk']))
            dxn = vadd(dxn, linear_b(dvalues[li][pos], state_dict[f'layer{li}.attn_wv'], xn, dstate[f'layer{li}.attn_wv']))
            dx = vadd(rmsnorm_b(dxn, xn, ainv), dx)  # residual: grads add
        demb = rmsnorm_b(dx, x0, inv0)
        vaccum(dstate['wte'][tok], demb)
        vaccum(dstate['wpe'][pos], demb)

    # Adam update with cosine learning-rate decay and bias-corrected moments.
    lr_t = learning_rate * 0.5 * (1 + math.cos(math.pi * step / num_steps))
    m_hat_corr, v_hat_corr = 1 - beta1**(step+1), 1 - beta2**(step+1)
    for k in state_dict:
        for i, row in enumerate(state_dict[k]):
            for j in range(len(row)):
                g = dstate[k][i][j]
                m_state[k][i][j] = beta1 * m_state[k][i][j] + (1 - beta1) * g
                v_state[k][i][j] = beta2 * v_state[k][i][j] + (1 - beta2) * g**2
                state_dict[k][i][j] -= lr_t * (m_state[k][i][j] / m_hat_corr) / ((v_state[k][i][j] / v_hat_corr)**0.5 + eps_adam)
    print(f"step {step+1}/{num_steps} loss: {loss:.4f}")

print(f"\nTotal training time: {time.time() - t0:.2f}s")

  156. # Inference
  157. temperature = 2.0
  158. for sample_idx in range(20):
  159. keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
  160. token_id, sample = BOS, []
  161. for pos in range(block_size):
  162. probs = softmax(vscale(forward_pos(token_id, pos, keys, values), temperature))
  163. token_id = random.choices(range(vocab_size), weights=probs)[0]
  164. if token_id == BOS:
  165. break
  166. sample.append(uchars[token_id])
  167. print(f"{sample_idx+1}: {''.join(sample)}")
  168. # your code goes here
Runtime error #stdin #stdout #stderr 0.98s 39392KB
stdin
Standard input is empty
stdout
Standard output is empty
stderr
Traceback (most recent call last):
  File "/usr/lib/python3.12/urllib/request.py", line 1344, in do_open
    h.request(req.get_method(), req.selector, req.data, headers,
  File "/usr/lib/python3.12/http/client.py", line 1336, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/usr/lib/python3.12/http/client.py", line 1382, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "/usr/lib/python3.12/http/client.py", line 1331, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "/usr/lib/python3.12/http/client.py", line 1091, in _send_output
    self.send(msg)
  File "/usr/lib/python3.12/http/client.py", line 1035, in send
    self.connect()
  File "/usr/lib/python3.12/http/client.py", line 1470, in connect
    super().connect()
  File "/usr/lib/python3.12/http/client.py", line 1001, in connect
    self.sock = self._create_connection(
                ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 828, in create_connection
    for res in getaddrinfo(host, port, 0, SOCK_STREAM):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 963, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
socket.gaierror: [Errno -3] Temporary failure in name resolution

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "./prog.py", line 8, in <module>
  File "/usr/lib/python3.12/urllib/request.py", line 240, in urlretrieve
    with contextlib.closing(urlopen(url, data)) as fp:
                            ^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/urllib/request.py", line 215, in urlopen
    return opener.open(url, data, timeout)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/urllib/request.py", line 515, in open
    response = self._open(req, data)
               ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/urllib/request.py", line 532, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/urllib/request.py", line 492, in _call_chain
    result = func(*args)
             ^^^^^^^^^^^
  File "/usr/lib/python3.12/urllib/request.py", line 1392, in https_open
    return self.do_open(http.client.HTTPSConnection, req,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/urllib/request.py", line 1347, in do_open
    raise URLError(err)
urllib.error.URLError: <urlopen error [Errno -3] Temporary failure in name resolution>