
Commit 7fada8a

added data preparation files

1 parent 7b6a0a3 commit 7fada8a

File tree

4 files changed: +227 -0 lines changed


data/enwik8/prepare.py (+75)

@@ -0,0 +1,75 @@
"""
Prepare the enwik8 dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
Will save train.bin, val.bin, test.bin containing the ids, and meta.pkl containing
the encoder and decoder and some other related info.
"""
import os
import pickle
import requests
import numpy as np

# download the enwik8 dataset
input_file_path = os.path.join(os.path.dirname(__file__), 'enwik8')
if not os.path.exists(input_file_path):
    data_url = 'http://mattmahoney.net/dc/enwik8.zip'
    r = requests.get(data_url)
    with open(os.path.join(os.path.dirname(__file__), 'enwik8.zip'), 'wb') as f:
        f.write(r.content)

    # unzip the enwik8 dataset
    import zipfile
    with zipfile.ZipFile(os.path.join(os.path.dirname(__file__), 'enwik8.zip'), 'r') as zip_ref:
        zip_ref.extractall(os.path.dirname(__file__))

with open(input_file_path, 'r', encoding='latin-1') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# create the train, validation, and test splits
n = len(data)
num_test_chars = 5000000
train_data = data[: -2 * num_test_chars]
val_data = data[-2 * num_test_chars: -num_test_chars]
test_data = data[-num_test_chars:]

# encode all splits to integers
train_ids = encode(train_data)
val_ids = encode(val_data)
test_ids = encode(test_data)

print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")
print(f"test has {len(test_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
test_ids = np.array(test_ids, dtype=np.uint16)

train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
test_ids.tofile(os.path.join(os.path.dirname(__file__), 'test.bin'))

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)
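
For reference, the artifacts this script writes can be read straight back; below is a minimal sketch of that consumption path, assuming it is run from the same data/enwik8 directory. The memmap-style loading mirrors how nanoGPT-style training loops typically read these .bin files, but it is not part of this commit.

import os
import pickle
import numpy as np

data_dir = os.path.dirname(__file__)

# the .bin files are raw uint16 arrays written with tofile(), so memmap them back
train_ids = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
val_ids = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')

# recover the character <-> id mappings saved in meta.pkl
with open(os.path.join(data_dir, 'meta.pkl'), 'rb') as f:
    meta = pickle.load(f)
itos = meta['itos']

# round-trip check: decode the first 100 ids back into text
print(''.join(itos[int(i)] for i in train_ids[:100]))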

data/shakespeare_char/prepare.py (+68)

@@ -0,0 +1,68 @@
"""
Prepare the Shakespeare dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
Will save train.bin, val.bin containing the ids, and meta.pkl containing the
encoder and decoder and some other related info.
"""
import os
import pickle
import requests
import numpy as np

# download the tiny shakespeare dataset
input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# create the train and validation splits
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

# encode both to integers
train_ids = encode(train_data)
val_ids = encode(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)

# length of dataset in characters: 1115394
# all the unique characters:
# !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
# vocab size: 65
# train has 1003854 tokens
# val has 111540 tokens
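
As context for how these files are meant to be used, here is a hypothetical sketch of a character-level training loop drawing input/target batches from the train.bin written above. block_size and batch_size are illustrative values and are not defined anywhere in this commit.

import os
import numpy as np

data_dir = os.path.dirname(__file__)
block_size = 256  # illustrative context length
batch_size = 64   # illustrative batch size

# the file is a flat uint16 array, so memmap it rather than loading it all into memory
data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')

# pick random windows; y is x shifted one character to the right
ix = np.random.randint(0, len(data) - block_size, size=batch_size)
x = np.stack([data[i : i + block_size].astype(np.int64) for i in ix])
y = np.stack([data[i + 1 : i + 1 + block_size].astype(np.int64) for i in ix])
print(x.shape, y.shape)  # (64, 256) and (64, 256)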

data/shakespeare_char/readme.md (+9)

@@ -0,0 +1,9 @@
# tiny shakespeare, character-level

Tiny Shakespeare, of the good old char-rnn fame :) Treated at the character level.

After running `prepare.py`:

- train.bin has 1,003,854 tokens
- val.bin has 111,540 tokens
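
A quick, illustrative way to confirm those counts after running prepare.py (run from inside data/shakespeare_char; not part of this commit):

import numpy as np

print(len(np.fromfile('train.bin', dtype=np.uint16)))  # expect 1003854
print(len(np.fromfile('val.bin', dtype=np.uint16)))    # expect 111540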

data/text8/prepare.py (+75)

@@ -0,0 +1,75 @@
"""
Prepare the text8 dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
Will save train.bin, val.bin, test.bin containing the ids, and meta.pkl containing
the encoder and decoder and some other related info.
"""
import os
import pickle
import requests
import numpy as np

# download the text8 dataset
input_file_path = os.path.join(os.path.dirname(__file__), 'text8')
if not os.path.exists(input_file_path):
    data_url = 'http://mattmahoney.net/dc/text8.zip'
    r = requests.get(data_url)
    with open(os.path.join(os.path.dirname(__file__), 'text8.zip'), 'wb') as f:
        f.write(r.content)

    # unzip the text8 dataset
    import zipfile
    with zipfile.ZipFile(os.path.join(os.path.dirname(__file__), 'text8.zip'), 'r') as zip_ref:
        zip_ref.extractall(os.path.dirname(__file__))

with open(input_file_path, 'r') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

# get all the unique characters that occur in this text
chars = sorted(list(set(data)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# create the train, validation, and test splits
n = len(data)
num_test_chars = 5000000
train_data = data[: -2 * num_test_chars]
val_data = data[-2 * num_test_chars: -num_test_chars]
test_data = data[-num_test_chars:]

# encode all splits to integers
train_ids = encode(train_data)
val_ids = encode(val_data)
test_ids = encode(test_data)

print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")
print(f"test has {len(test_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
test_ids = np.array(test_ids, dtype=np.uint16)

train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin'))
val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin'))
test_ids.tofile(os.path.join(os.path.dirname(__file__), 'test.bin'))

# save the meta information as well, to help us encode/decode later
meta = {
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f:
    pickle.dump(meta, f)
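
One assumption baked into these scripts is that every character id fits in uint16 and that the two 5,000,000-character splits cover the tail of the data exactly. A small sanity check that could be appended to the end of this prepare.py, using only the variables already defined above (illustrative, not part of the commit):

# the ids are exported as uint16, so the vocabulary must fit in 16 bits
assert vocab_size <= 2**16, f"vocab_size {vocab_size} does not fit in uint16"

# the 90M/5M/5M-style split should cover the whole dataset with no overlap
assert len(train_data) + len(val_data) + len(test_data) == n
assert len(val_data) == num_test_chars and len(test_data) == num_test_chars
print(f"train/val/test characters: {len(train_data):,} / {len(val_data):,} / {len(test_data):,}")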
