1
- // Copyright (c) 2020 Jisang Yoon
1
+ // Copyright (c) 2021 Jisang Yoon
2
2
// All rights reserved.
3
3
//
4
4
// This source code is licensed under the Apache 2.0 license found in the
5
5
// LICENSE file in the root directory of this source tree.
6
6
7
7
syntax = "proto2" ;
8
8
9
+
10
// Options for data preprocessing.
message IoUtilsConfigProto {
  // Logging levels in Python and C++ (lower = more verbose;
  // exact scale defined by the consuming code).
  optional int32 py_log_level = 1 [default = 2];
  optional int32 c_log_level = 2 [default = 2];

  // Number of lines per chunk when preprocessing (txt => hdf5 format) data.
  optional int32 chunk_lines = 3 [default = 100000];

  // Number of concurrent threads used in data preprocessing.
  optional int32 num_threads = 4 [default = 4];

  // Convert characters to lower case if true.
  optional bool lower = 5 [default = true];
}
16
25
26
+
27
// Options for the LDA model.
message CuLDAConfigProto {
  // Logging levels in Python and C++ (lower = more verbose;
  // exact scale defined by the consuming code).
  optional int32 py_log_level = 1 [default = 2];
  optional int32 c_log_level = 2 [default = 2];

  // Raw data path
  // (format from https://archive.ics.uci.edu/ml/datasets/bag+of+words).
  optional string data_path = 7;

  // Preprocessed data path (hdf5 format).
  // If empty, a temporary directory is created.
  optional string processed_data_path = 6;

  // Vocabulary path.
  // NOTE(review): proto2 `required` is discouraged (it can never be removed
  // compatibly); kept as-is to preserve validation behavior, but consider
  // relaxing to `optional` in a future revision.
  required string keys_path = 16;

  // Skip preprocessing if true (a preprocessed hdf5 file must already exist).
  optional bool skip_preprocess = 8;

  // Path to store gamma in the E step.
  // If empty, a temporary directory is created.
  optional string gamma_path = 17;

  // Reuse gamma from the previous epoch if true.
  // If false, initialize gamma as in Figure 6 of
  // https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf.
  optional bool reuse_gamma = 18;

  // Number of topics.
  optional int32 num_topics = 3 [default = 10];

  // Block dimension in CUDA.
  // Should be a multiple of WARP_SIZE (= 32).
  optional int32 block_dim = 4 [default = 32];

  // Sets the number of blocks as
  // num_blocks * block_dim = physical_cores_in_GPU * hyper_threads.
  optional int32 hyper_threads = 5 [default = 100];

  // Batch size in training.
  optional int32 batch_size = 10 [default = 1000000];

  // Number of epochs in training.
  optional int32 epochs = 11 [default = 10];

  // Number of iterations in each E step.
  optional int32 num_iters_in_e_step = 12 [default = 5];

  // Validation ratio; should be between 0 and 1.
  optional double vali_p = 13 [default = 0.2];

  // Random seed.
  optional int32 seed = 14 [default = 777];

  // Remove all temporary directories generated by the package
  // when the program finishes, if true.
  optional bool remove_tmp = 19 [default = true];

  // Data-preprocessing options.
  optional IoUtilsConfigProto io = 15;

  // Field 9 (word_min_count) was deleted from this message; reserve the
  // number and name so they cannot be reused with different semantics.
  reserved 9;
  reserved "word_min_count";
}
40
84
41
-
85
+ // options for loading pretrained w2v model
86
+ // can load w2v model file generated by gensim or original w2v code by Google
42
87
message W2VPretrainedModel {
43
88
optional string filename = 1 ;
44
89
optional bool no_header = 2 ;
@@ -47,32 +92,66 @@ message W2VPretrainedModel {
47
92
}
48
93
49
94
95
+ // option for training Word2Vec model
50
96
message CuW2VConfigProto {
51
- optional string data_path = 7 ;
52
- required string processed_data_dir = 6 ;
53
- optional bool skip_preprocess = 8 ;
54
-
97
+ // logging levels in python and C++
55
98
optional int32 py_log_level = 1 [default = 2 ];
56
99
optional int32 c_log_level = 2 [default = 2 ];
57
100
101
+ // raw data path (stream txt format)
102
+ optional string data_path = 7 ;
103
+
104
+ // path to save preprocessed data (hdf5 format)
105
+ optional string processed_data_dir = 6 ;
106
+
107
+ // skip data preprocessing (therefore, there should be
108
+ // already preprocessed hdf5 format file) if true
109
+ optional bool skip_preprocess = 8 ;
110
+
111
+ // number of embedding dimensions
58
112
optional int32 num_dims = 3 [default = 50 ];
113
+
114
+ // block_dim in CUDA
59
115
optional int32 block_dim = 4 [default = 32 ];
116
+
117
+ // set number of blocks as num_blocks * block_dim = physical_cores_in_GPU * hyper_threads
60
118
optional int32 hyper_threads = 5 [default = 100 ];
119
+
120
+ // generate vocabulary from words that appear in the corpus at least word_min_count times
61
121
optional int32 word_min_count = 9 [default = 5 ];
122
+
123
+ // batch size and number of epochs in training
62
124
optional int32 batch_size = 10 [default = 1000000 ];
63
125
optional int32 epochs = 11 [default = 10 ];
64
126
65
127
// seed fields
66
128
optional int32 seed = 14 [default = 777 ];
129
+
130
+ // random table size in negative sampling
67
131
optional int32 random_size = 12 [default = 100000000 ];
68
132
133
+ // number of negative samples
134
+ // if zero, it uses hierarchical softmax
69
135
optional int32 neg = 17 [default = 10 ];
70
- // as recommended in w2v paper
136
+
137
+ // weight in negative sampling will be word_count ** count_power for each word
138
+ // default value 0.75 is recommended in w2v paper
71
139
optional double count_power = 18 [default = 0.75 ];
140
+
141
+ // if true, train skip gram model, else train cbow model
72
142
optional bool skip_gram = 19 [default = true ];
143
+
144
+ // if true, use average context vector in cbow model
145
+ // else use summation of context vectors
73
146
optional bool cbow_mean = 20 [default = true ];
147
+
148
+ // learning rate
74
149
optional double lr = 21 [default = 0.001 ];
150
+
151
+ // window size in both skip gram and cbow model
75
152
optional int32 window_size = 22 [default = 5 ];
153
+
154
+ // remove all temporary directories generated by the package when the program finishes, if true
76
155
optional bool remove_tmp = 26 [default = true ];
77
156
78
157
optional IoUtilsConfigProto io = 24 ;
0 commit comments