Skip to content

Commit 817260e

Browse files
committed
almost done
0 parents  commit 817260e

File tree

5 files changed

+1156
-0
lines changed

5 files changed

+1156
-0
lines changed

merge_index.py

+94
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import os,sys
2+
from os.path import isfile, join
3+
import timeit
4+
from collections import defaultdict
5+
from heapq import heapify, heappush, heappop
6+
7+
# ---------------------------------------------------------------------------
# Output side: merged chunk files and the secondary (chunk lookup) index.
# ---------------------------------------------------------------------------
# Directory that receives index<N>.txt and secondary_index.txt.
indexFolder = "./merged_index/"
# Number of merged chunk files written so far (bumped by update_primary_index).
index_file_count = 0
# Maps the first (smallest) term of each chunk file to that chunk's number.
secondary_index = {}
# Number of merged terms collected before a chunk file is flushed.
chunk_size = 100000

# ---------------------------------------------------------------------------
# Input side: the partial index files that are merged.
# ---------------------------------------------------------------------------
index_files_location = "./indexFiles/"
# os.listdir() returns entries in arbitrary order; sorting makes the order in
# which postings from different files are concatenated reproducible.
index_files = sorted(
    join(index_files_location, f)
    for f in os.listdir(index_files_location)
    if isfile(join(index_files_location, f))
)
no_of_index_file = len(index_files)
# NOTE: despite the name, the merge loop sets is_completed[i] to True while
# file i still has entries to merge and back to False once it is exhausted.
is_completed = [False] * no_of_index_file

# ---------------------------------------------------------------------------
# Merge state, keyed by the position of the file in index_files.
# ---------------------------------------------------------------------------
file_pointers = {}          # open handle of each partial index file
current_row_of_file = {}    # most recently read line of each file
k_way_heap = []             # min-heap of the terms currently at the file heads
words = {}                  # current line of each file split into [term, postings]
total = 0                   # merged terms accumulated since the last flush
# Term -> comma-joined postings of the chunk being built.  A plain dict:
# a defaultdict without a default factory behaves exactly like one.
invertedIndex = {}
23+
24+
25+
def update_primary_index():
    """Flush the in-memory ``invertedIndex`` to the next merged chunk file.

    Writes ``<indexFolder>index<N>.txt`` with one ``term=postings`` line per
    term in sorted order, where ``N`` is the incremented global
    ``index_file_count``, and records the chunk's first term in
    ``secondary_index`` so the chunk can be located later.

    Nothing is written (and the counter is left untouched) when
    ``invertedIndex`` is empty, so no empty, unreferenced chunk file is
    produced.  The caller is responsible for clearing ``invertedIndex``
    afterwards.
    """
    global index_file_count

    if not invertedIndex:
        return

    # The output directory may not exist yet on a fresh checkout.
    os.makedirs(indexFolder, exist_ok=True)

    index_file_count = index_file_count + 1
    fileName = indexFolder + "index" + str(index_file_count) + ".txt"
    terms = sorted(invertedIndex)

    # The context manager guarantees the chunk is flushed and closed even if
    # a write fails part-way through.
    with open(fileName, "w") as fp:
        fp.writelines(
            str(term) + "=" + invertedIndex[term] + "\n" for term in terms
        )

    # Only the smallest term of the chunk goes into the secondary index.
    secondary_index[terms[0]] = index_file_count
38+
39+
def update_secondary_index():
    """Write ``secondary_index`` to ``<indexFolder>secondary_index.txt``.

    Each line has the form ``<term> <chunk number>``; lines are emitted in
    sorted term order.  The output directory is created when missing and the
    file is closed before returning.
    """
    os.makedirs(indexFolder, exist_ok=True)
    fileName = indexFolder + "secondary_index.txt"
    with open(fileName, "w") as fp:
        fp.writelines(
            str(term) + " " + str(secondary_index[term]) + "\n"
            for term in sorted(secondary_index)
        )
45+
46+
# ---------------------------------------------------------------------------
# K-way merge of the partial index files into chunked, sorted index files.
#
# Every partial file contributes its current term to a min-heap.  The
# smallest term is popped, the postings of every file currently positioned
# on that term are concatenated, and those files are advanced by one entry.
# After chunk_size merged terms the accumulated chunk is flushed to disk.
#
# NOTE: is_completed[i] is True while file i still has entries to merge and
# False once it is exhausted (or could not be opened).
# ---------------------------------------------------------------------------
start = timeit.default_timer()

# Terms currently sitting in k_way_heap, for O(1) duplicate checks.
_pending_terms = set()


def _queue_term(term):
    """Push ``term`` onto the merge heap unless it is already queued."""
    if term not in _pending_terms:
        _pending_terms.add(term)
        heappush(k_way_heap, term)


def _load_next_entry(i):
    """Advance partial index file ``i`` to its next ``term=postings`` entry.

    Blank lines and lines without ``=`` are skipped.  On success the stripped
    line is stored in ``current_row_of_file[i]`` and ``[term, postings]`` in
    ``words[i]``.  At end of file the handle is closed and ``is_completed[i]``
    is reset to False.

    Returns True when an entry was loaded, False when the file is exhausted.
    """
    fp = file_pointers[i]
    while True:
        line = fp.readline()
        if not line:
            # readline() returns "" only at true end of file.
            is_completed[i] = False
            fp.close()
            # os.remove(index_files[i])
            return False
        row = line.strip()
        if not row:
            continue
        # Split once so a postings list containing "=" is kept intact.
        term, sep, postings = row.partition("=")
        if not sep:
            print("skipping malformed line in " + index_files[i])
            continue
        current_row_of_file[i] = row
        words[i] = [term, postings]
        return True


try:
    # Prime the merge with the first entry of every readable file.
    for i in range(no_of_index_file):
        try:
            file_pointers[i] = open(index_files[i], "r")
        except OSError:
            # Leave is_completed[i] False so the merge ignores this file.
            print("problem in opening file " + index_files[i])
            continue
        is_completed[i] = True
        if _load_next_entry(i):
            _queue_term(words[i][0])

    # The heap holds the current term of every active file, so it runs
    # empty exactly when all files are exhausted.
    while k_way_heap:
        total = total + 1
        word = heappop(k_way_heap)
        _pending_terms.discard(word)

        for i in range(no_of_index_file):
            if is_completed[i] and words[i][0] == word:
                if word not in invertedIndex:
                    invertedIndex[word] = words[i][1]
                else:
                    invertedIndex[word] += "," + words[i][1]

                if _load_next_entry(i):
                    _queue_term(words[i][0])

        if total >= chunk_size:
            total = 0
            update_primary_index()
            invertedIndex.clear()
finally:
    # Release any handle still open, e.g. after an error mid-merge.
    # Closing an already closed file is a no-op.
    for _fp in file_pointers.values():
        _fp.close()

# Flush the last, partially filled chunk (if any), then the lookup index.
if invertedIndex:
    update_primary_index()
update_secondary_index()

stop = timeit.default_timer()

print ("Time for Merging:",stop-start," seconds.")
mins = float(stop-start)/float(60)
print ("Time for Merging:",mins," Minutes.")
hrs = float(mins)/float(60)
print ("Time for Merging:",hrs," Hours.")
print ("Check the External File(s) Now!")

0 commit comments

Comments
 (0)