-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwordGenerator.py
More file actions
108 lines (78 loc) · 2.38 KB
/
wordGenerator.py
File metadata and controls
108 lines (78 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import numpy as np
from numpy.random import choice, seed
import constants as const
####################################
#Configuration parameters
SEED = None #None -> a fresh random stream each run; set an int for reproducible output
GENERATE_LIMIT = 600 #Number of generated words
NEWLINE = ord("\n") #10 -- character code used as the end-of-word marker in the chain
WORD_MIN_LENGTH = 6 #generated words shorter than this are discarded
WORD_MAX_LENGTH = 12 #generated words longer than this are discarded
####################################
seed(SEED)
#Resolve all data files relative to this script's own directory,
#so the script works regardless of the current working directory.
dirname = os.path.dirname(__file__)
sourceFile = os.path.join(dirname, const.sourceFile)
outfile = os.path.join(dirname, const.outfile)
chainTable = os.path.join(dirname, const.chainTable)
#Construct the set of original words (used later to mark generated
#words that already exist in the source list with a trailing "*").
import re
pattern = re.compile("[,]")
originalWords = set()
with open(sourceFile, "r") as lines:
    for line in lines:
        # Keep only the field before the first comma.  strip() is the
        # fix: "for line in file" keeps the trailing "\n", so comma-less
        # lines were stored as "word\n" and could never match a
        # generated word in the membership test below.
        # NOTE(review): assumes the source format is "word[,extra]" per
        # line -- confirm against const.sourceFile.
        originalWords.add(pattern.split(line)[0].strip())
#Pull the 2nd-order Markov chain table back from file.
#count[i, j, k] = number of times character k followed the pair (i, j).
count = np.fromfile(chainTable, dtype="int32")
count = count.reshape(const.arraySize, const.arraySize, const.arraySize)
#Total transitions out of each (i, j) prefix pair.
s = count.sum(axis=2)
#Normalize into P(k | i, j).  Rows whose total is zero are left as
#all-zero probabilities directly, instead of dividing (which emitted
#0/0 RuntimeWarnings) and patching the NaNs afterwards.  The broadcast
#s[:, :, None] replaces the original tile/transpose construction.
p = np.zeros(count.shape, dtype=float)
np.divide(count, s[:, :, None], out=p, where=s[:, :, None] != 0)
# Generate words from the Markov chain until GENERATE_LIMIT distinct
# words have been collected; container maps word -> occurrence count.
container = {}
a = range(const.arraySize)
while True:
    #(i, j) is the sliding two-character prefix, starting from (0, 0)
    i = 0
    j = 0
    word = ""
    while True:
        #Draw the next character code from P(k | i, j).
        #NOTE(review): numpy raises ValueError if p[i, j] sums to 0
        #(an unreachable prefix) -- same behavior as the original.
        k = choice(a, p=p[i, j, :])
        if k == NEWLINE:
            break  #end-of-word marker
        word += chr(k)
        i, j = j, k
    #If not within expected length then skip
    if not (WORD_MIN_LENGTH <= len(word) <= WORD_MAX_LENGTH):
        continue
    #Mark words coming from the original list
    if word in originalWords:
        word += "*"
    #Count occurrences under the FINAL key.  Bug fix: the original read
    #container.get(word, 0) BEFORE appending "*", so for original-list
    #words it looked up the unstarred key (never stored) and reset the
    #starred entry to 1 on every generation -- their counts could never
    #grow past 1.
    container[word] = container.get(word, 0) + 1
    #Stop when we reach the limit of distinct words
    if len(container) >= GENERATE_LIMIT:
        break
from collections import OrderedDict

#Report 1: alphabetical order -- print "word - count" and write each
#word (one per line) to the output file.  The context manager replaces
#the bare open()/close() pair, guaranteeing the file is closed even if
#printing or writing raises; also drops the stray trailing comma in
#the original `outFile.write(w, )`.
d = OrderedDict(sorted(container.items(), key=lambda t: t[0]))
with open(outfile, "w") as outFile:
    for w, v in d.items():
        print(w, " - ", v)
        outFile.write(w)
        outFile.write("\n")
print(" -------------------------------------------- ")
#Report 2: descending generation count, stopping at the first word
#that was generated only once (everything after it also has count 1).
d = OrderedDict(sorted(container.items(), reverse=True, key=lambda x: x[1]))
for w, v in d.items():
    print(w, " - ", v)
    if v == 1:
        break