Skip to content

Commit fe04eb3

Browse files
committed
attempt to improve reflist and deflist performance
1 parent 8b339a4 commit fe04eb3

File tree

2 files changed

+60
-28
lines changed

2 files changed

+60
-28
lines changed

elixir/data.py

Lines changed: 52 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -64,24 +64,35 @@ class DefList:
6464
a line number and a file family.
6565
Also stores in which families the ident exists for faster tests.'''
6666
def __init__(self, data=b'#'):
    '''Parse a packed definition list.

    `data` is split on b'#': the part before it holds the entries (matched
    with deflist_regex and decoded one by one), the part after it is kept
    as-is in `self.families`.
    '''
    packed_entries, self.families = data.split(b'#')
    raw_matches = deflist_regex.findall(packed_entries)
    self.entries = list(map(self.decode_entry, raw_matches))
    # Stored order is not trusted; iter() and pack() sort on demand.
    self.sorted = False
70+
71+
def decode_entry(self, entry):
    '''Turn one raw four-field match into an (id, type, line, family) tuple.

    Fields 0 and 2 are converted with int(); field 1 is decoded and mapped
    through defTypeR; field 3 is decoded to str.
    '''
    return (
        int(entry[0]),
        defTypeR[entry[1].decode()],
        int(entry[2]),
        entry[3].decode(),
    )
77+
78+
def encode_entry(self, entry):
    '''Build the packed text form of an (id, type, line, family) entry.

    The type is translated through defTypeD; the four pieces are
    concatenated without separators.
    '''
    pieces = (str(entry[0]), defTypeD[entry[1]], str(entry[2]), entry[3])
    return "".join(pieces)
6880

6981
def iter(self, dummy=False):
    '''Yield every entry as an (id, type, line, family) tuple, ordered by id.

    The entry list is sorted in place the first time it is needed after a
    modification. When `dummy` is true, a trailing
    (maxId, None, None, None) sentinel is yielded after the real entries.
    '''
    if not self.sorted:
        self.entries.sort(key=lambda x: int(x[0]))
        self.sorted = True

    # This function contains `yield`, so it is always a generator: a plain
    # `return self.entries` would finish the iteration without producing
    # any item. The entries must be yielded in both modes.
    yield from self.entries
    if dummy:
        yield maxId, None, None, None
8193

8294
def exists(self, idx, line_num):
83-
entries = deflist_regex.findall(self.data)
84-
for id, _, line, _ in entries:
95+
for id, _, line, _ in self.entries:
8596
if id == idx and int(line) == line_num:
8697
return True
8798

@@ -90,14 +101,18 @@ def exists(self, idx, line_num):
90101
def append(self, id, type, line, family):
    '''Add one definition entry and register its family.

    Entries whose `type` is not a key of defTypeD are silently ignored.
    '''
    if type in defTypeD:
        self.entries.append((id, type, line, family))
        # The new entry may be out of order; force a re-sort on next use.
        self.sorted = False
        self.add_family(family)
98108

99109
def pack(self):
    '''Serialize the list to bytes.

    Entries are sorted by id if needed, encoded with encode_entry(),
    joined with ',' and followed by b'#' and the family list.
    '''
    if not self.sorted:
        self.entries.sort(key=lambda item: int(item[0]))
        self.sorted = True

    encoded = [self.encode_entry(item) for item in self.entries]
    packed = ",".join(encoded).encode()
    return packed + b'#' + self.families
101116

102117
def add_family(self, family):
103118
family = family.encode()
@@ -110,7 +125,7 @@ def get_families(self):
110125
return self.families.decode().split(',')
111126

112127
def get_macros(self):
    '''Return the family (fourth field) of every entry whose type is 'macro'.'''
    families = []
    for entry in self.entries:
        if entry[1] == 'macro':
            families.append(entry[3])
    return families
114129

115130
class PathList:
116131
'''Stores associations between a blob ID and a file path.
@@ -139,25 +154,36 @@ class RefList:
139154
and the corresponding family.'''
140155
def __init__(self, data=b''):
141156
self.data = data
157+
self.entries = [self.decode_entry(x.split(b':')) for x in self.data.split(b'\n')[:-1]]
158+
self.sorted = False
159+
160+
def decode_entry(self, k):
    '''Decode the first three bytes fields of `k` into (int id, str, str).'''
    ident = int(k[0].decode())
    lines = k[1].decode()
    family = k[2].decode()
    return ident, lines, family
142162

143163
def iter(self, dummy=False):
    '''Yield every entry as an (id, lines, family) tuple, ordered by id.

    The entry list is sorted in place when it is not already marked as
    sorted. When `dummy` is true, a trailing (maxId, None, None) sentinel
    is yielded after the real entries.
    '''
    needs_sort = not self.sorted
    if needs_sort:
        self.sorted = True
        self.entries.sort(key=lambda item: int(item[0]))

    for ident, lines, family in self.entries:
        yield ident, lines, family

    if dummy:
        yield maxId, None, None
154173

155174
def append(self, id, lines, family):
    '''Add one (id, lines, family) entry and mark the list as unsorted.'''
    new_entry = (id, lines, family)
    self.entries.append(new_entry)
    # The new entry may be out of order; force a re-sort on next use.
    self.sorted = False
158177

159178
def pack(self):
    '''Serialize the list to bytes, one "id:lines:family" line per entry.

    Entries are sorted by id first if the list is not marked as sorted.
    Every line, including the last one, is terminated by a newline; an
    empty list packs to b''.
    '''
    if not self.sorted:
        self.sorted = True
        self.entries.sort(key=lambda x: int(x[0]))

    # Build the output with a single join instead of repeated `+=`, which
    # can degrade to quadratic time on large lists.
    return "".join(
        str(id) + ":" + lines + ":" + family + "\n"
        for id, lines, family in self.entries
    ).encode()
161187

162188
class BsdDB:
163189
def __init__(self, filename, readonly, contentType, shared=False, cachesize=None):

elixir/update.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,10 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool):
306306
logger.info("docs done")
307307

308308
if dts_comp_support:
309-
for result in pool.imap_unordered(get_comps, idxes, chunksize):
309+
comp_idxes = [idx for idx in idxes if getFileFamily(idx[2]) not in (None, 'K', 'M')]
310+
comp_chunksize = int(len(comp_idxes) / cpu_count())
311+
comp_chunksize = min(max(1, comp_chunksize), 100)
312+
for result in pool.imap_unordered(get_comps, comp_idxes, comp_chunksize):
310313
if result is not None:
311314
add_comps(db, *result)
312315

@@ -318,7 +321,10 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool):
318321

319322
logger.info("dts comps docs done")
320323

321-
for result in pool.imap_unordered(get_refs, idxes, chunksize):
324+
ref_idxes = [idx for idx in idxes if getFileFamily(idx[2]) is not None]
325+
ref_chunksize = int(len(ref_idxes) / cpu_count())
326+
ref_chunksize = min(max(1, ref_chunksize), 100)
327+
for result in pool.imap_unordered(get_refs, ref_idxes, ref_chunksize):
322328
if result is not None:
323329
add_refs(db, idx_to_hash_and_filename, result)
324330

0 commit comments

Comments
 (0)