
Commit ffc944a

refactor example codes
1 parent e24a25a commit ffc944a

5 files changed: +147 −31 lines changed

cusim/proto/config.proto (+91 −12)
@@ -1,44 +1,89 @@
-// Copyright (c) 2020 Jisang Yoon
+// Copyright (c) 2021 Jisang Yoon
 // All rights reserved.
 //
 // This source code is licensed under the Apache 2.0 license found in the
 // LICENSE file in the root directory of this source tree.
 
 syntax = "proto2";
 
+
+// option for data preprocessing
 message IoUtilsConfigProto {
+  // logging levels in python and C++
   optional int32 py_log_level = 1 [default = 2];
   optional int32 c_log_level = 2 [default = 2];
+
+  // number of lines per chunk when preprocessing data (txt => hdf5 format)
   optional int32 chunk_lines = 3 [default = 100000];
+
+  // number of concurrent threads in data preprocessing
   optional int32 num_threads = 4 [default = 4];
+
+  // convert characters to lower case if true
   optional bool lower = 5 [default = true];
 }
 
+
+// option for LDA model
 message CuLDAConfigProto {
+  // logging levels in python and C++
+  optional int32 py_log_level = 1 [default = 2];
+  optional int32 c_log_level = 2 [default = 2];
+
+  // raw data path (format from https://archive.ics.uci.edu/ml/datasets/bag+of+words)
   optional string data_path = 7;
-  required string processed_data_path = 6;
+
+  // preprocessed data path (hdf5 format)
+  // if empty, make a temporary directory
+  optional string processed_data_path = 6;
+
+  // vocabulary path
   required string keys_path = 16;
+
+  // skip preprocessing (a preprocessed hdf5 file should already exist) if true
   optional bool skip_preprocess = 8;
+
+  // path to store gamma in the E step
+  // if empty, make a temporary directory
   optional string gamma_path = 17;
-  optional bool reuse_gamma = 18;
 
-  optional int32 py_log_level = 1 [default = 2];
-  optional int32 c_log_level = 2 [default = 2];
+  // reuse gamma from the previous epoch if true
+  // if false, initialize gamma as in Figure 6 of https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf
+  optional bool reuse_gamma = 18;
 
+  // number of topics
   optional int32 num_topics = 3 [default = 10];
+
+  // block dimension in CUDA
+  // should be a multiple of WARP_SIZE (=32)
   optional int32 block_dim = 4 [default = 32];
+
+  // set the number of blocks so that num_blocks * block_dim = physical_cores_in_GPU * hyper_threads
   optional int32 hyper_threads = 5 [default = 100];
-  optional int32 word_min_count = 9 [default = 5];
+
+  // batch size in training
   optional int32 batch_size = 10 [default = 1000000];
+
+  // number of epochs in training
   optional int32 epochs = 11 [default = 10];
+
+  // number of iterations in each E step
   optional int32 num_iters_in_e_step = 12 [default = 5];
+
+  // validation ratio, should be between 0 and 1
   optional double vali_p = 13 [default = 0.2];
+
+  // random seed
   optional int32 seed = 14 [default = 777];
+
+  // remove all temporary directories generated by the package when the program finishes if true
   optional bool remove_tmp = 19 [default = true];
+
   optional IoUtilsConfigProto io = 15;
 }
 
-
+// options for loading a pretrained w2v model
+// can load w2v model files generated by gensim or by Google's original w2v code
 message W2VPretrainedModel {
   optional string filename = 1;
   optional bool no_header = 2;
@@ -47,32 +92,66 @@ message W2VPretrainedModel {
 }
 
 
+// option for training the Word2Vec model
 message CuW2VConfigProto {
-  optional string data_path = 7;
-  required string processed_data_dir = 6;
-  optional bool skip_preprocess = 8;
-
+  // logging levels in python and C++
   optional int32 py_log_level = 1 [default = 2];
   optional int32 c_log_level = 2 [default = 2];
 
+  // raw data path (stream txt format)
+  optional string data_path = 7;
+
+  // path to save preprocessed data (hdf5 format)
+  optional string processed_data_dir = 6;
+
+  // skip data preprocessing (therefore, a preprocessed
+  // hdf5 file should already exist) if true
+  optional bool skip_preprocess = 8;
+
+  // number of embedding dimensions
   optional int32 num_dims = 3 [default = 50];
+
+  // block dimension in CUDA
   optional int32 block_dim = 4 [default = 32];
+
+  // set the number of blocks so that num_blocks * block_dim = physical_cores_in_GPU * hyper_threads
   optional int32 hyper_threads = 5 [default = 100];
+
+  // build the vocabulary from words appearing in the corpus at least word_min_count times
   optional int32 word_min_count = 9 [default = 5];
+
+  // batch size and number of epochs in training
   optional int32 batch_size = 10 [default = 1000000];
   optional int32 epochs = 11 [default = 10];
 
   // seed fields
   optional int32 seed = 14 [default = 777];
+
+  // random table size in negative sampling
   optional int32 random_size = 12 [default = 100000000];
 
+  // number of negative samples
+  // if zero, hierarchical softmax is used
   optional int32 neg = 17 [default = 10];
-  // as recommended in w2v paper
+
+  // the sampling weight of each word in negative sampling will be word_count ** count_power
+  // the default value 0.75 is recommended in the w2v paper
   optional double count_power = 18 [default = 0.75];
+
+  // if true, train a skip gram model; else train a cbow model
   optional bool skip_gram = 19 [default = true];
+
+  // if true, use the average of context vectors in the cbow model;
+  // else use their summation
   optional bool cbow_mean = 20 [default = true];
+
+  // learning rate
   optional double lr = 21 [default = 0.001];
+
+  // window size in both skip gram and cbow models
   optional int32 window_size = 22 [default = 5];
+
+  // remove all temporary directories generated by the package when the program finishes if true
   optional bool remove_tmp = 26 [default = true];
 
   optional IoUtilsConfigProto io = 24;
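The new `hyper_threads` comments pin down how the CUDA grid is sized: num_blocks * block_dim = physical_cores_in_GPU * hyper_threads. A minimal sketch of that arithmetic (the helper function and the example core count are hypothetical, not part of cusim):

```python
# Hypothetical helper illustrating the grid-sizing rule from the proto comments:
# num_blocks * block_dim = physical_cores_in_GPU * hyper_threads
WARP_SIZE = 32

def num_blocks(physical_cores_in_gpu, hyper_threads=100, block_dim=32):
  # block_dim should be a multiple of WARP_SIZE, per the CuLDAConfigProto comment
  assert block_dim % WARP_SIZE == 0
  return physical_cores_in_gpu * hyper_threads // block_dim

# e.g. a GPU with 3584 CUDA cores and the default settings:
print(num_blocks(3584))  # 3584 * 100 // 32 = 11200 blocks
```

Oversubscribing each physical core by a factor of `hyper_threads` (default 100) keeps the GPU busy while warps stall on memory.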

examples/example_lda.py (+1 −1)
@@ -72,7 +72,7 @@ def run_cusim():
     "num_topics": 50,
     "num_iters_in_e_step": 10,
     "reuse_gamma": True,
-    "skip_preprocess": os.path.exists(processed_data_path),
+    # "skip_preprocess": os.path.exists(processed_data_path),
   }
   start = time.time()
   lda = CuLDA(opt)
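For context, `processed_data_path` points at the hdf5 file produced by the first run, so the now-commented line would skip the txt => hdf5 conversion whenever that file already exists. A minimal sketch of re-enabling it, with hypothetical stand-ins for the paths defined earlier in examples/example_lda.py:

```python
import os

# hypothetical stand-ins for the paths defined in examples/example_lda.py
data_path = "./res/docword.nips.txt"
processed_data_path = "./res/nips-converted.h5"

opt = {
  "data_path": data_path,
  "processed_data_path": processed_data_path,
  "num_topics": 50,
  # skip the txt => hdf5 conversion when the converted file already exists
  "skip_preprocess": os.path.exists(processed_data_path),
}
print(opt["skip_preprocess"])  # False on a fresh checkout, True after one run
```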

examples/example_w2v.py (+51 −17)
@@ -12,6 +12,7 @@
 
 import tqdm
 import fire
+import pandas as pd
 
 import gensim
 from gensim import downloader as api
@@ -28,10 +29,19 @@
 DATASET = "quora-duplicate-questions"
 DATA_PATH = f"./res/{DATASET}.stream.txt"
 PROCESSED_DATA_DIR = "./res/{DATASET}-processed"
-MIN_COUNT = 5
 CUSIM_MODEL = "./res/cusim.w2v.model"
 GENSIM_MODEL = "./res/gensim.w2v.model"
 
+
+# common hyperparameters
+MIN_COUNT = 5
+LEARNING_RATE = 0.001
+NEG_SIZE = 10
+NUM_DIMS = 100
+CBOW_MEAN = False
+EPOCHS = 10
+
+
 def download():
   if os.path.exists(DATA_PATH):
     LOGGER.info("%s already exists", DATA_PATH)
@@ -68,46 +78,70 @@ def preprocess_line(line, tokenizer, lemmatizer):
   line = [lemmatizer.lemmatize(token) for token in line]
   return " ".join(line)
 
-def run_cusim():
+def run_cusim(skip_gram=False, hierarchical_softmax=False):
   download()
   opt = {
     "data_path": DATA_PATH,
     "processed_data_dir": PROCESSED_DATA_DIR,
-    "num_dims": 100,
+    # "skip_preprocess": os.path.exists(PROCESSED_DATA_DIR),
+    "num_dims": NUM_DIMS,
+    "epochs": EPOCHS,
+    "word_min_count": MIN_COUNT,
+    "lr": 0.001,
     "io": {
       "lower": False
     },
-    "neg": 0,
-    "skip_gram": False,
-    "cbow_mean": False,
+    "neg": 0 if hierarchical_softmax else NEG_SIZE,
+    "skip_gram": skip_gram,
+    "cbow_mean": CBOW_MEAN,
   }
   start = time.time()
   w2v = CuW2V(opt)
   w2v.train_model()
-  LOGGER.info("elapsed for cusim w2v training: %.4e sec", time.time() - start)
+  elapsed = time.time() - start
+  LOGGER.info("elapsed for cusim w2v training: %.4e sec", elapsed)
   w2v.save_word2vec_format(CUSIM_MODEL, binary=False)
-  evaluate_w2v_model(CUSIM_MODEL)
+  return elapsed, evaluate_w2v_model(CUSIM_MODEL)
 
-def run_gensim():
+def run_gensim(skip_gram=False, hierarchical_softmax=False, workers=8):
   download()
   start = time.time()
-  model = gensim.models.Word2Vec(corpus_file=DATA_PATH, min_alpha=0.001,
-                                 min_count=5, sg=False, hs=True, workers=4,
-                                 alpha=0.001, negative=10, iter=10,
-                                 cbow_mean=False)
-  LOGGER.info("elapsed for gensim w2v training: %.4e sec", time.time() - start)
+  model = gensim.models.Word2Vec(corpus_file=DATA_PATH, workers=workers,
+                                 sg=skip_gram, hs=hierarchical_softmax,
+                                 min_alpha=LEARNING_RATE, min_count=MIN_COUNT,
+                                 alpha=LEARNING_RATE, negative=NEG_SIZE,
+                                 iter=EPOCHS, cbow_mean=CBOW_MEAN,
+                                 size=NUM_DIMS)
+  elapsed = time.time() - start
+  LOGGER.info("elapsed for gensim w2v training: %.4e sec", elapsed)
   model.wv.save_word2vec_format(GENSIM_MODEL, binary=False)
   LOGGER.info("gensim w2v model is saved to %s", GENSIM_MODEL)
-  evaluate_w2v_model(GENSIM_MODEL)
+  return elapsed, evaluate_w2v_model(GENSIM_MODEL)
 
 def evaluate_w2v_model(model=GENSIM_MODEL):
   LOGGER.info("load word2vec format model from %s", model)
   model = gensim.models.KeyedVectors.load_word2vec_format(model)
   results = model.wv.evaluate_word_pairs(datapath("wordsim353.tsv"),
                                          case_insensitive=False)
   LOGGER.info("evaluation results: %s", results)
-
-
+  return results
+
+def run_experiments(sg0=False, hs0=False):
+  training_time = {"attr": "training_time"}
+  pearson = {"attr": "pearson"}
+  spearman = {"attr": "spearman"}
+  for i in [1, 2, 4, 8]:
+    elapsed, evals = run_gensim(sg0, hs0, i)
+    training_time[f"{i} workers"] = elapsed
+    pearson[f"{i} workers"] = evals[0][0]
+    spearman[f"{i} workers"] = evals[1][0]
+  elapsed, evals = run_cusim(sg0, hs0)
+  training_time["GPU"] = elapsed
+  pearson["GPU"] = evals[0][0]
+  spearman["GPU"] = evals[1][0]
+  df0 = pd.DataFrame([training_time, pearson, spearman])
+  df0.set_index("attr", inplace=True)
+  print(df0.to_markdown())
 
 
 if __name__ == "__main__":
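In `run_experiments`, `evals` comes from gensim's `evaluate_word_pairs`, which returns `((pearson_r, pearson_p), (spearman_r, spearman_p), oov_ratio)`; so `evals[0][0]` and `evals[1][0]` pick out the Pearson and Spearman correlations on wordsim353. Since the script imports fire, the `__main__` block presumably dispatches these functions through python-fire; if so, the new benchmark would be launched along these lines (the exact invocation is an assumption, not shown in the diff):

```
python examples/example_w2v.py run_experiments --sg0=True --hs0=False
```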

examples/requirements.txt (+3 −1)
@@ -1,5 +1,7 @@
 fire
-gensim
+gensim==3.8.3
 nltk
 tqdm
 wget
+pandas
+tabulate
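Two notes on these pins, as I read them: `tabulate` is needed because `DataFrame.to_markdown()` in `run_experiments` delegates to it, and pinning `gensim==3.8.3` matches the `iter=` and `size=` keyword arguments used above, which gensim 4.0 renamed to `epochs=` and `vector_size=`.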

requirements.txt (+1 −0)
@@ -1,6 +1,7 @@
 h5py
 jsmin
 numpy
+scipy
 pybind11
 protobuf==3.10.0
 grpcio-tools==1.27.1
