utils.py
# -*- coding: utf-8 -*-
import numpy as np
import jieba
import preprocess as p
import os
from keras.preprocessing.sequence import pad_sequences
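
# Word-level input utilities for the NER model: segment character sequences with
# jieba, load pre-trained word vectors, build the word embedding matrix, and map
# word sequences to padded index arrays.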
# stop_word_file = 'dicts/stop_words.txt'
jieba.set_dictionary('/home/curry/NER/NER_Chinese-Text-master/dicts/dict.txt.big')
jieba.initialize()
word_embedding_file = 'data/word_embedding_matrix.npy'


def get_word_data(char_data):
    """
    Segment each character sequence into words with jieba.
    :param char_data: list of character sequences, one per sentence
    :return: list of word lists, one per sentence
    """
    seq_data = [''.join(l) for l in char_data]
    word_data = []
    # stop_words = [line.strip() for line in open(stop_word_file, 'r', encoding='utf-8')]
    for seq in seq_data:
        seq_cut = jieba.cut(seq, cut_all=False)
        word_data.append([w for w in seq_cut])
    return word_data


def get_word2object():
    """
    Load the pre-trained word vectors.
    :return: word2vec dict, number of words, embedding size, word2index dict
    """
    word2vec = {}
    f = open('Word2Vec/model/word_vec_word', encoding='utf-8')  # load pre-trained word embedding
    i = 0
    for line in f:
        tep_list = line.split()
        if i == 0:
            n_word = int(tep_list[0])
            n_embed = int(tep_list[1])
        elif ord(tep_list[0][0]) > 122:
            # keep only tokens whose first character is non-ASCII (i.e. Chinese words)
            word = tep_list[0]
            vec = np.asarray(tep_list[1:], dtype='float32')
            word2vec[word] = vec
        i += 1
    f.close()
    # index words from 1; index 0 is reserved for padding and unknown words
    word2index = {k: i for i, k in enumerate(sorted(word2vec.keys()), 1)}
    return word2vec, n_word, n_embed, word2index
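
# Note: get_word2object assumes the plain-text word2vec format: a header line
# "<n_word> <n_embed>" followed by one "<word> <v1> ... <v_n>" line per word
# (the file used here reports 332648 words of dimension 300, per the figures
# noted in the commented-out block at the bottom of this module).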


def get_word_embedding_matrix(word2vec, n_vocab, n_embed, word2index):
    """
    Build the embedding matrix from the pre-trained word vectors.
    :param word2vec: word -> vector dict
    :param n_vocab: vocabulary size (len(word2vec) + 1; row 0 is left for padding/unknown)
    :param n_embed: embedding dimension
    :param word2index: word -> row index dict
    :return: (n_vocab, n_embed) embedding matrix
    """
    embedding_mat = np.zeros([n_vocab, n_embed])
    for w, i in word2index.items():
        vec = word2vec.get(w)
        if vec is not None and len(vec) == n_embed:
            embedding_mat[i] = vec
    if not os.path.exists(word_embedding_file):
        np.save(word_embedding_file, embedding_mat)
    return embedding_mat
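
# With the figures noted below (n_vocab = 157142, n_embed = 300), embedding_mat
# has shape (157142, 300); row 0 (padding/unknown) and rows for words without a
# pre-trained vector stay all-zero.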


def add_word_data(word_data, word2index, max_length):
    """
    Map each word sequence to its index sequence and pad/truncate to max_length.
    :param word_data: list of word lists
    :param word2index: word -> index dict
    :param max_length: length of the padded sequences
    :return: (n_samples, max_length) int32 array of word indices
    """
    index_data = []
    for l in word_data:
        # out-of-vocabulary words are mapped to index 0 (the padding index)
        index_data.append([word2index[s] if word2index.get(s) is not None else 0
                           for s in l])
    index_array = pad_sequences(index_data, maxlen=max_length, dtype='int32',
                                padding='post', truncating='post', value=0)
    return index_array


# if __name__ == '__main__':
#
#     char_train, tag_train = p.get_char_tag_data('data/allwords.txt')
#
#     word_train = get_word_data(char_train)
#     # print(word_train[100][:20])
#
#     word2vec, n_word, n_embed, word2index = get_word2object()
#     n_vocab = len(word2vec.keys()) + 1
#     print(n_word, n_vocab, n_embed)  # 332648, 157142, 300
#
#     if os.path.exists(word_embedding_file):
#         word_embedding_matrix = np.load(word_embedding_file)
#     else:
#         word_embedding_matrix = get_word_embedding_matrix(word2vec, n_vocab,
#                                                           n_embed, word2index)
#
#     # length = []  # word counts per text after segmentation and stop-word removal
#     # for data in [word_train, word_dev]:
#     #     for l in data:
#     #         length.append(len(l))
#     # # print(max(length), length[800:1000])  # 348
#     # count = 0
#     # for k in length:
#     #     if k > 80:
#     #         count += 1
#     # print(count, len(length))  # 64 23509
#
#     word_index_train = add_word_data(word_train, word2index, 300)
#     # (21147, 200) (2362, 200) (4706, 200)
#
#     np.save('data/word_train_add.npy', word_index_train)
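

# A minimal downstream sketch, assuming the model consumes the saved matrix through
# a Keras Embedding layer: the matrix initialises the layer weights, mask_zero=True
# masks the padding/unknown index 0, and the padded index arrays produced by
# add_word_data are fed as inputs.
#
# from keras.layers import Embedding
#
# embedding_matrix = np.load(word_embedding_file)
# embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
#                             output_dim=embedding_matrix.shape[1],
#                             weights=[embedding_matrix],
#                             mask_zero=True,
#                             trainable=False)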