|
| 1 | +import os,sys |
| 2 | +from os.path import isfile, join |
| 3 | +import timeit |
| 4 | +from collections import defaultdict |
| 5 | +from heapq import heapify, heappush, heappop |
| 6 | + |
# --- Output side: where merged index chunks go and how big each chunk is ---
indexFolder = "./merged_index/"
index_file_count = 0          # number of primary index files written so far
secondary_index = dict()      # first term of each primary file -> file number
chunk_size = 100000           # distinct terms merged before a chunk is flushed

# --- Input side: every regular file directly inside the partial-index folder ---
index_files_location = "./indexFiles/"
index_files = [
    path
    for path in (
        join(index_files_location, name)
        for name in os.listdir(index_files_location)
    )
    if isfile(path)
]
no_of_index_file = len(index_files)
# One flag per input file; the merge loop flips these as files are opened
# and exhausted.
is_completed = [False] * no_of_index_file

# --- Per-run merge state, keyed by the input file's position in index_files ---
file_pointers = dict()        # position -> open file object
current_row_of_file = dict()  # position -> most recently read line
k_way_heap = []               # min-heap of the terms waiting to be merged
words = dict()                # position -> [term, postings] of the current line
total = 0                     # terms merged since the last chunk flush
invertedIndex = defaultdict()  # term -> comma-joined postings for this chunk
| 23 | + |
| 24 | + |
def update_primary_index():
    """Flush the in-memory chunk to the next numbered primary index file.

    Increments the module-level ``index_file_count`` and writes every entry of
    ``invertedIndex`` to ``<indexFolder>index<N>.txt`` as ``term=postings``
    lines in sorted term order. The smallest term of the chunk is recorded in
    ``secondary_index`` as pointing at file ``N``; an empty chunk adds no
    secondary entry.

    The caller is responsible for clearing ``invertedIndex`` afterwards.

    Raises:
        OSError: if the output file cannot be opened or written.
    """
    global index_file_count
    index_file_count = index_file_count + 1
    fileName = indexFolder + "index" + str(index_file_count) + ".txt"
    sorted_terms = sorted(invertedIndex)

    # The context manager guarantees the chunk is flushed and the handle is
    # released even if a write fails; the original left the file open.
    with open(fileName, "w") as fp:
        if sorted_terms:
            # Only the first (smallest) term of a chunk goes into the
            # secondary index.
            secondary_index[sorted_terms[0]] = index_file_count
        for term in sorted_terms:
            fp.write(str(term) + "=" + invertedIndex[term] + "\n")
| 38 | + |
def update_secondary_index():
    """Write the secondary index to ``<indexFolder>secondary_index.txt``.

    Each line has the form ``term file_number`` (space separated), emitted in
    sorted term order from the module-level ``secondary_index`` mapping.

    Raises:
        OSError: if the output file cannot be opened or written.
    """
    fileName = indexFolder + "secondary_index.txt"
    # Use a context manager so the file is flushed and closed; the original
    # never closed the handle.
    with open(fileName, "w") as fp:
        for term in sorted(secondary_index):
            fp.write(str(term) + " " + str(secondary_index[term]) + "\n")
| 45 | + |
def _read_entry(fp):
    """Return the next ``[term, postings]`` pair from *fp*, or ``None`` at EOF.

    Blank lines are skipped rather than being mistaken for end-of-file, and
    the line is split on the first ``=`` only, so postings that themselves
    contain ``=`` are kept intact.

    Raises:
        ValueError: if a non-blank line has no ``=`` separator.
    """
    while True:
        raw_line = fp.readline()
        if not raw_line:
            # readline() returns "" only at end of file.
            return None
        line = raw_line.strip()
        if not line:
            continue
        term, separator, postings = line.partition("=")
        if not separator:
            raise ValueError("malformed index line (missing '='): %r" % line)
        return [term, postings]


start = timeit.default_timer()

# Terms currently sitting in k_way_heap, kept alongside the heap so the
# duplicate check is a set lookup instead of a linear scan of the heap list.
queued_words = set()

try:
    # Prime the merge: open every input file and queue its first term.
    # NOTE: is_completed[i] is True while file i still has an unmerged
    # current entry and False once it is exhausted (or was never opened).
    for i in range(no_of_index_file):
        try:
            file_pointers[i] = open(index_files[i], "r")
        except OSError:
            # Skip unreadable files instead of falling through and failing
            # on the missing file_pointers[i] entry.
            print("problem in opening file ", index_files[i])
            continue

        first_entry = _read_entry(file_pointers[i])
        if first_entry is None:
            # Empty input file: nothing to merge from it.
            file_pointers[i].close()
            continue

        words[i] = first_entry
        is_completed[i] = True
        if first_entry[0] not in queued_words:
            queued_words.add(first_entry[0])
            heappush(k_way_heap, first_entry[0])

    # Every active file's current term is in the heap, so the heap is
    # non-empty exactly while at least one file is still active.
    while k_way_heap:
        total = total + 1
        word = heappop(k_way_heap)
        queued_words.discard(word)

        for i in range(no_of_index_file):
            if not (is_completed[i] and words[i][0] == word):
                continue

            # Concatenate the postings of every file that carries this term.
            if word in invertedIndex:
                invertedIndex[word] += "," + words[i][1]
            else:
                invertedIndex[word] = words[i][1]

            # Advance file i to its next entry.
            next_entry = _read_entry(file_pointers[i])
            if next_entry is None:
                is_completed[i] = False
                file_pointers[i].close()
                continue

            words[i] = next_entry
            if next_entry[0] not in queued_words:
                queued_words.add(next_entry[0])
                heappush(k_way_heap, next_entry[0])

        # Flush a full chunk to disk and start a fresh one.
        if total >= chunk_size:
            total = 0
            update_primary_index()
            invertedIndex.clear()
finally:
    # close() is idempotent, so already-closed handles are harmless here;
    # this only matters when the merge aborts part-way through.
    for open_file in file_pointers.values():
        open_file.close()

# Write the trailing partial chunk, if any. Skipping the call when nothing is
# pending avoids emitting an empty primary file with no secondary entry.
if invertedIndex:
    update_primary_index()
update_secondary_index()

stop = timeit.default_timer()

print("Time for Merging:", stop - start, " seconds.")
mins = float(stop - start) / float(60)
print("Time for Merging:", mins, " Minutes.")
hrs = float(mins) / float(60)
print("Time for Merging:", hrs, " Hours.")
print("Check the External File(s) Now!")
0 commit comments