
Commit ffc944a

refactor example codes
1 parent e24a25a commit ffc944a

5 files changed: +147 −31 lines changed

cusim/proto/config.proto (+91 −12)
@@ -1,44 +1,89 @@
-// Copyright (c) 2020 Jisang Yoon
+// Copyright (c) 2021 Jisang Yoon
 // All rights reserved.
 //
 // This source code is licensed under the Apache 2.0 license found in the
 // LICENSE file in the root directory of this source tree.
 
 syntax = "proto2";
 
+
+// option for data preprocessing
 message IoUtilsConfigProto {
+  // logging levels in python and C++
   optional int32 py_log_level = 1 [default = 2];
   optional int32 c_log_level = 2 [default = 2];
+
+  // number of lines per chunk when preprocessing data (txt => hdf5 format)
   optional int32 chunk_lines = 3 [default = 100000];
+
+  // number of concurrent threads in data preprocessing
   optional int32 num_threads = 4 [default = 4];
+
+  // convert characters to lower case if true
   optional bool lower = 5 [default = true];
 }
 
+
+// option for LDA model
 message CuLDAConfigProto {
+  // logging levels in python and C++
+  optional int32 py_log_level = 1 [default = 2];
+  optional int32 c_log_level = 2 [default = 2];
+
+  // raw data path (format from https://archive.ics.uci.edu/ml/datasets/bag+of+words)
   optional string data_path = 7;
-  required string processed_data_path = 6;
+
+  // preprocessed data path (hdf5 format)
+  // if empty, make a temporary directory
+  optional string processed_data_path = 6;
+
+  // vocabulary path
   required string keys_path = 16;
+
+  // skip preprocessing (a preprocessed hdf5 file should already exist) if true
   optional bool skip_preprocess = 8;
+
+  // path to store gamma in the E step
+  // if empty, make a temporary directory
   optional string gamma_path = 17;
-  optional bool reuse_gamma = 18;
 
-  optional int32 py_log_level = 1 [default = 2];
-  optional int32 c_log_level = 2 [default = 2];
+  // reuse gamma from the previous epoch if true
+  // if false, initialize gamma as in Figure 6 of https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf
+  optional bool reuse_gamma = 18;
 
+  // number of topics
   optional int32 num_topics = 3 [default = 10];
+
+  // block dimension in CUDA
+  // should be a multiple of WARP_SIZE (=32)
   optional int32 block_dim = 4 [default = 32];
+
+  // set the number of blocks so that num_blocks * block_dim = physical_cores_in_GPU * hyper_threads
   optional int32 hyper_threads = 5 [default = 100];
-  optional int32 word_min_count = 9 [default = 5];
+
+  // batch size in training
   optional int32 batch_size = 10 [default = 1000000];
+
+  // number of epochs in training
   optional int32 epochs = 11 [default = 10];
+
+  // number of iterations in each E step
   optional int32 num_iters_in_e_step = 12 [default = 5];
+
+  // validation ratio, should be between 0 and 1
   optional double vali_p = 13 [default = 0.2];
+
+  // random seed
   optional int32 seed = 14 [default = 777];
+
+  // remove all temporary directories generated by the package when the program finishes if true
   optional bool remove_tmp = 19 [default = true];
+
   optional IoUtilsConfigProto io = 15;
 }
 
-
+// options for loading a pretrained w2v model
+// can load w2v model files generated by gensim or by Google's original w2v code
 message W2VPretrainedModel {
   optional string filename = 1;
   optional bool no_header = 2;
@@ -47,32 +92,66 @@ message W2VPretrainedModel {
 }
 
 
+// option for training the Word2Vec model
 message CuW2VConfigProto {
-  optional string data_path = 7;
-  required string processed_data_dir = 6;
-  optional bool skip_preprocess = 8;
-
+  // logging levels in python and C++
   optional int32 py_log_level = 1 [default = 2];
   optional int32 c_log_level = 2 [default = 2];
 
+  // raw data path (stream txt format)
+  optional string data_path = 7;
+
+  // path to save preprocessed data (hdf5 format)
+  optional string processed_data_dir = 6;
+
+  // skip data preprocessing (therefore, a preprocessed
+  // hdf5 file should already exist) if true
+  optional bool skip_preprocess = 8;
+
+  // number of embedding dimensions
   optional int32 num_dims = 3 [default = 50];
+
+  // block dimension in CUDA
   optional int32 block_dim = 4 [default = 32];
+
+  // set the number of blocks so that num_blocks * block_dim = physical_cores_in_GPU * hyper_threads
   optional int32 hyper_threads = 5 [default = 100];
+
+  // build the vocabulary from words appearing in the corpus at least word_min_count times
   optional int32 word_min_count = 9 [default = 5];
+
+  // batch size and number of epochs in training
   optional int32 batch_size = 10 [default = 1000000];
   optional int32 epochs = 11 [default = 10];
 
   // seed fields
   optional int32 seed = 14 [default = 777];
+
+  // random table size in negative sampling
   optional int32 random_size = 12 [default = 100000000];
 
+  // number of negative samples
+  // if zero, hierarchical softmax is used
   optional int32 neg = 17 [default = 10];
-  // as recommended in w2v paper
+
+  // the sampling weight of each word in negative sampling will be word_count ** count_power
+  // the default value 0.75 is recommended in the w2v paper
   optional double count_power = 18 [default = 0.75];
+
+  // if true, train a skip gram model; else train a cbow model
   optional bool skip_gram = 19 [default = true];
+
+  // if true, use the average of context vectors in the cbow model;
+  // else use their summation
   optional bool cbow_mean = 20 [default = true];
+
+  // learning rate
   optional double lr = 21 [default = 0.001];
+
+  // window size in both skip gram and cbow models
   optional int32 window_size = 22 [default = 5];
+
+  // remove all temporary directories generated by the package when the program finishes if true
   optional bool remove_tmp = 26 [default = true];
 
   optional IoUtilsConfigProto io = 24;
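The new `hyper_threads` comments pin down how the CUDA grid is sized: num_blocks * block_dim = physical_cores_in_GPU * hyper_threads. A minimal sketch of that arithmetic (the helper function and the example core count are hypothetical, not part of cusim):

```python
# Hypothetical helper illustrating the grid-sizing rule from the proto comments:
# num_blocks * block_dim = physical_cores_in_GPU * hyper_threads
WARP_SIZE = 32

def num_blocks(physical_cores_in_gpu, hyper_threads=100, block_dim=32):
  # block_dim should be a multiple of WARP_SIZE, per the CuLDAConfigProto comment
  assert block_dim % WARP_SIZE == 0
  return physical_cores_in_gpu * hyper_threads // block_dim

# e.g. a GPU with 3584 CUDA cores and the default settings:
print(num_blocks(3584))  # 3584 * 100 // 32 = 11200 blocks
```

Oversubscribing each physical core by a factor of `hyper_threads` (default 100) keeps the GPU busy while warps stall on memory.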

examples/example_lda.py (+1 −1)
@@ -72,7 +72,7 @@ def run_cusim():
     "num_topics": 50,
     "num_iters_in_e_step": 10,
     "reuse_gamma": True,
-    "skip_preprocess": os.path.exists(processed_data_path),
+    # "skip_preprocess": os.path.exists(processed_data_path),
   }
   start = time.time()
   lda = CuLDA(opt)
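For context, `processed_data_path` points at the hdf5 file produced by the first run, so the now-commented line would skip the txt => hdf5 conversion whenever that file already exists. A minimal sketch of re-enabling it, with hypothetical stand-ins for the paths defined earlier in examples/example_lda.py:

```python
import os

# hypothetical stand-ins for the paths defined in examples/example_lda.py
data_path = "./res/docword.nips.txt"
processed_data_path = "./res/nips-converted.h5"

opt = {
  "data_path": data_path,
  "processed_data_path": processed_data_path,
  "num_topics": 50,
  # skip the txt => hdf5 conversion when the converted file already exists
  "skip_preprocess": os.path.exists(processed_data_path),
}
print(opt["skip_preprocess"])  # False on a fresh checkout, True after one run
```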

examples/example_w2v.py (+51 −17)
@@ -12,6 +12,7 @@
 
 import tqdm
 import fire
+import pandas as pd
 
 import gensim
 from gensim import downloader as api
@@ -28,10 +29,19 @@
 DATASET = "quora-duplicate-questions"
 DATA_PATH = f"./res/{DATASET}.stream.txt"
 PROCESSED_DATA_DIR = "./res/{DATASET}-processed"
-MIN_COUNT = 5
 CUSIM_MODEL = "./res/cusim.w2v.model"
 GENSIM_MODEL = "./res/gensim.w2v.model"
 
+
+# common hyperparameters
+MIN_COUNT = 5
+LEARNING_RATE = 0.001
+NEG_SIZE = 10
+NUM_DIMS = 100
+CBOW_MEAN = False
+EPOCHS = 10
+
+
 def download():
   if os.path.exists(DATA_PATH):
     LOGGER.info("%s already exists", DATA_PATH)
@@ -68,46 +78,70 @@ def preprocess_line(line, tokenizer, lemmatizer):
   line = [lemmatizer.lemmatize(token) for token in line]
   return " ".join(line)
 
-def run_cusim():
+def run_cusim(skip_gram=False, hierarchical_softmax=False):
   download()
   opt = {
     "data_path": DATA_PATH,
     "processed_data_dir": PROCESSED_DATA_DIR,
-    "num_dims": 100,
+    # "skip_preprocess": os.path.exists(PROCESSED_DATA_DIR),
+    "num_dims": NUM_DIMS,
+    "epochs": EPOCHS,
+    "word_min_count": MIN_COUNT,
+    "lr": 0.001,
     "io": {
       "lower": False
     },
-    "neg": 0,
-    "skip_gram": False,
-    "cbow_mean": False,
+    "neg": 0 if hierarchical_softmax else NEG_SIZE,
+    "skip_gram": skip_gram,
+    "cbow_mean": CBOW_MEAN,
   }
   start = time.time()
   w2v = CuW2V(opt)
   w2v.train_model()
-  LOGGER.info("elapsed for cusim w2v training: %.4e sec", time.time() - start)
+  elapsed = time.time() - start
+  LOGGER.info("elapsed for cusim w2v training: %.4e sec", elapsed)
   w2v.save_word2vec_format(CUSIM_MODEL, binary=False)
-  evaluate_w2v_model(CUSIM_MODEL)
+  return elapsed, evaluate_w2v_model(CUSIM_MODEL)
 
-def run_gensim():
+def run_gensim(skip_gram=False, hierarchical_softmax=False, workers=8):
   download()
   start = time.time()
-  model = gensim.models.Word2Vec(corpus_file=DATA_PATH, min_alpha=0.001,
-                                 min_count=5, sg=False, hs=True, workers=4,
-                                 alpha=0.001, negative=10, iter=10,
-                                 cbow_mean=False)
-  LOGGER.info("elapsed for gensim w2v training: %.4e sec", time.time() - start)
+  model = gensim.models.Word2Vec(corpus_file=DATA_PATH, workers=workers,
+                                 sg=skip_gram, hs=hierarchical_softmax,
+                                 min_alpha=LEARNING_RATE, min_count=MIN_COUNT,
+                                 alpha=LEARNING_RATE, negative=NEG_SIZE,
+                                 iter=EPOCHS, cbow_mean=CBOW_MEAN,
+                                 size=NUM_DIMS)
+  elapsed = time.time() - start
+  LOGGER.info("elapsed for gensim w2v training: %.4e sec", elapsed)
   model.wv.save_word2vec_format(GENSIM_MODEL, binary=False)
   LOGGER.info("gensim w2v model is saved to %s", GENSIM_MODEL)
-  evaluate_w2v_model(GENSIM_MODEL)
+  return elapsed, evaluate_w2v_model(GENSIM_MODEL)
 
 def evaluate_w2v_model(model=GENSIM_MODEL):
   LOGGER.info("load word2vec format model from %s", model)
   model = gensim.models.KeyedVectors.load_word2vec_format(model)
   results = model.wv.evaluate_word_pairs(datapath("wordsim353.tsv"),
                                          case_insensitive=False)
   LOGGER.info("evaluation results: %s", results)
-
-
+  return results
+
+def run_experiments(sg0=False, hs0=False):
+  training_time = {"attr": "training_time"}
+  pearson = {"attr": "pearson"}
+  spearman = {"attr": "spearman"}
+  for i in [1, 2, 4, 8]:
+    elapsed, evals = run_gensim(sg0, hs0, i)
+    training_time[f"{i} workers"] = elapsed
+    pearson[f"{i} workers"] = evals[0][0]
+    spearman[f"{i} workers"] = evals[1][0]
+  elapsed, evals = run_cusim(sg0, hs0)
+  training_time["GPU"] = elapsed
+  pearson["GPU"] = evals[0][0]
+  spearman["GPU"] = evals[1][0]
+  df0 = pd.DataFrame([training_time, pearson, spearman])
+  df0.set_index("attr", inplace=True)
+  print(df0.to_markdown())
 
 
 if __name__ == "__main__":
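In `run_experiments`, `evals` comes from gensim's `evaluate_word_pairs`, which returns `((pearson_r, pearson_p), (spearman_r, spearman_p), oov_ratio)`; so `evals[0][0]` and `evals[1][0]` pick out the Pearson and Spearman correlations on wordsim353. Since the script imports fire, the `__main__` block presumably dispatches these functions through python-fire; if so, the new benchmark would be launched along these lines (the exact invocation is an assumption, not shown in the diff):

```
python examples/example_w2v.py run_experiments --sg0=True --hs0=False
```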

examples/requirements.txt (+3 −1)
@@ -1,5 +1,7 @@
 fire
-gensim
+gensim==3.8.3
 nltk
 tqdm
 wget
+pandas
+tabulate
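Two notes on these pins, as I read them: `tabulate` is needed because `DataFrame.to_markdown()` in `run_experiments` delegates to it, and pinning `gensim==3.8.3` matches the `iter=` and `size=` keyword arguments used above, which gensim 4.0 renamed to `epochs=` and `vector_size=`.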

requirements.txt (+1 −0)
@@ -1,6 +1,7 @@
 h5py
 jsmin
 numpy
+scipy
 pybind11
 protobuf==3.10.0
 grpcio-tools==1.27.1
