
Commit 7fada8a

added data preparation files

1 parent 7b6a0a3 commit 7fada8a

File tree

4 files changed: +227 -0 lines changed


data/enwik8/prepare.py (+75)

@@ -0,0 +1,75 @@
"""
Prepare the enwik8 dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
Will save train.bin, val.bin, test.bin containing the ids, and meta.pkl containing
the encoder and decoder and some other related info.
"""
import os
import pickle
import requests
import numpy as np

# download the enwik8 dataset
input_file_path = os.path.join(os.path.dirname(__file__), 'enwik8')
if not os.path.exists(input_file_path):
    data_url = 'http://mattmahoney.net/dc/enwik8.zip'
    r = requests.get(data_url)
    with open(os.path.join(os.path.dirname(__file__), 'enwik8.zip'), 'wb') as f:
        f.write(r.content)

    # unzip the enwik8 dataset
    import zipfile
    with zipfile.ZipFile(os.path.join(os.path.dirname(__file__), 'enwik8.zip'), 'r') as zip_ref:
        zip_ref.extractall(os.path.dirname(__file__))

with open(input_file_path, 'r', encoding='latin-1') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# create the train, validation, and test splits
n = len(data)
num_test_chars = 5000000
train_data = data[: -2 * num_test_chars]
val_data = data[-2 * num_test_chars: -num_test_chars]
test_data = data[-num_test_chars:]

# encode all splits to integers
train_ids = encode(train_data)
val_ids = encode(val_data)
test_ids = encode(test_data)

print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")
print(f"test has {len(test_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
test_ids = np.array(test_ids, dtype=np.uint16)

train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
test_ids.tofile(os.path.join(os.path.dirname(__file__), 'test.bin'))

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)
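
For reference, the artifacts this script writes can be read straight back; below is a minimal sketch of that consumption path, assuming it is run from the same data/enwik8 directory. The memmap-style loading mirrors how nanoGPT-style training loops typically read these .bin files, but it is not part of this commit.

import os
import pickle
import numpy as np

data_dir = os.path.dirname(__file__)

# the .bin files are raw uint16 arrays written with tofile(), so memmap them back
train_ids = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
val_ids = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')

# recover the character <-> id mappings saved in meta.pkl
with open(os.path.join(data_dir, 'meta.pkl'), 'rb') as f:
    meta = pickle.load(f)
itos = meta['itos']

# round-trip check: decode the first 100 ids back into text
print(''.join(itos[int(i)] for i in train_ids[:100]))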

data/shakespeare_char/prepare.py (+68)

@@ -0,0 +1,68 @@
"""
Prepare the Shakespeare dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
Will save train.bin, val.bin containing the ids, and meta.pkl containing the
encoder and decoder and some other related info.
"""
import os
import pickle
import requests
import numpy as np

# download the tiny shakespeare dataset
input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# create the train and validation splits
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

# encode both to integers
train_ids = encode(train_data)
val_ids = encode(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)

# length of dataset in characters: 1115394
# all the unique characters:
# !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
# vocab size: 65
# train has 1003854 tokens
# val has 111540 tokens
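
As context for how these files are meant to be used, here is a hypothetical sketch of a character-level training loop drawing input/target batches from the train.bin written above. block_size and batch_size are illustrative values and are not defined anywhere in this commit.

import os
import numpy as np

data_dir = os.path.dirname(__file__)
block_size = 256  # illustrative context length
batch_size = 64   # illustrative batch size

# the file is a flat uint16 array, so memmap it rather than loading it all into memory
data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')

# pick random windows; y is x shifted one character to the right
ix = np.random.randint(0, len(data) - block_size, size=batch_size)
x = np.stack([data[i : i + block_size].astype(np.int64) for i in ix])
y = np.stack([data[i + 1 : i + 1 + block_size].astype(np.int64) for i in ix])
print(x.shape, y.shape)  # (64, 256) and (64, 256)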

data/shakespeare_char/readme.md (+9)

@@ -0,0 +1,9 @@
# tiny shakespeare, character-level

Tiny Shakespeare, of the good old char-rnn fame :) Treated at the character level.

After running `prepare.py`:

- train.bin has 1,003,854 tokens
- val.bin has 111,540 tokens
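
A quick, illustrative way to confirm those counts after running prepare.py (run from inside data/shakespeare_char; not part of this commit):

import numpy as np

print(len(np.fromfile('train.bin', dtype=np.uint16)))  # expect 1003854
print(len(np.fromfile('val.bin', dtype=np.uint16)))    # expect 111540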

data/text8/prepare.py (+75)

@@ -0,0 +1,75 @@
"""
Prepare the text8 dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
Will save train.bin, val.bin, test.bin containing the ids, and meta.pkl containing
the encoder and decoder and some other related info.
"""
import os
import pickle
import requests
import numpy as np

# download the text8 dataset
input_file_path = os.path.join(os.path.dirname(__file__), 'text8')
if not os.path.exists(input_file_path):
    data_url = 'http://mattmahoney.net/dc/text8.zip'
    r = requests.get(data_url)
    with open(os.path.join(os.path.dirname(__file__), 'text8.zip'), 'wb') as f:
        f.write(r.content)

    # unzip the text8 dataset
    import zipfile
    with zipfile.ZipFile(os.path.join(os.path.dirname(__file__), 'text8.zip'), 'r') as zip_ref:
        zip_ref.extractall(os.path.dirname(__file__))

with open(input_file_path, 'r') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# create the train, validation, and test splits
n = len(data)
num_test_chars = 5000000
train_data = data[: -2 * num_test_chars]
val_data = data[-2 * num_test_chars: -num_test_chars]
test_data = data[-num_test_chars:]

# encode all splits to integers
train_ids = encode(train_data)
val_ids = encode(val_data)
test_ids = encode(test_data)

print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")
print(f"test has {len(test_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
test_ids = np.array(test_ids, dtype=np.uint16)

train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
test_ids.tofile(os.path.join(os.path.dirname(__file__), 'test.bin'))

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)
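
One assumption baked into these scripts is that every character id fits in uint16 and that the two 5,000,000-character splits cover the tail of the data exactly. A small sanity check that could be appended to the end of this prepare.py, using only the variables already defined above (illustrative, not part of the commit):

# the ids are exported as uint16, so the vocabulary must fit in 16 bits
assert vocab_size <= 2**16, f"vocab_size {vocab_size} does not fit in uint16"

# the 90M/5M/5M-style split should cover the whole dataset with no overlap
assert len(train_data) + len(val_data) + len(test_data) == n
assert len(val_data) == num_test_chars and len(test_data) == num_test_chars
print(f"train/val/test characters: {len(train_data):,} / {len(val_data):,} / {len(test_data):,}")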
