-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIR_engine.py
129 lines (99 loc) · 3.61 KB
/
IR_engine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""\
------------------------------------------------------------
USE: python <PROGNAME> (options)
OPTIONS:
-h : print this help message
-s : use "with stoplist" configuration (default: without)
-p : use "with stemming" configuration (default: without)
-w LABEL : use weighting scheme "LABEL" (LABEL in {binary, tf, tfidf}, default: binary)
-o FILE : output results to file FILE
------------------------------------------------------------\
"""
# ==============================================================================
# Importing
import sys
import getopt
import pickle
from my_retriever import Retrieve
# ==============================================================================
# Command line processing
def print_help():
    """Print the module usage message to stderr.

    The ``<PROGNAME>`` placeholder in the module docstring is replaced
    (first occurrence only) with the base name of the running script,
    taken from ``sys.argv[0]``.
    """
    import os.path  # local import keeps this block self-contained

    # basename() strips the directory part using the platform's own
    # separator rules, so Windows-style paths work as well as '/' ones.
    progname = os.path.basename(sys.argv[0])

    # __doc__ is None when docstrings are stripped (python -OO); fall back
    # to a minimal usage line instead of failing with AttributeError.
    template = __doc__ or 'USE: python <PROGNAME> (options)'
    usage = template.replace('<PROGNAME>', progname, 1)
    print(usage, file=sys.stderr)
class CommandLine:
    """Parse the command-line options and load the matching data.

    After construction, ``exit`` is True when the program should stop
    (help was requested or the options were invalid) and False once the
    configuration has been loaded. On success the instance exposes:

    term_weighting -- one of 'binary', 'tf' or 'tfidf' (default 'binary')
    outfile        -- the path given with -o
    index          -- the index entry of IR_data.pickle selected by -s / -p
    queries        -- the queries entry of IR_data.pickle selected by -s / -p
    """

    def __init__(self):
        # Pessimistic default: only cleared after everything has loaded.
        self.exit = True

        try:
            opts, args = getopt.getopt(sys.argv[1:], 'hspw:o:')
        except getopt.GetoptError as err:
            # Unknown option or missing option argument: report it like the
            # other usage errors instead of dying with a traceback.
            print("*** ERROR: %s ***" % err, file=sys.stderr)
            print_help()
            return
        opts = dict(opts)

        if '-h' in opts:
            print_help()
            return

        if args:
            print("*** ERROR: no arg files - only options! ***",
                  file=sys.stderr)
            print_help()
            return

        weighting = opts.get('-w', 'binary')
        if weighting not in ('binary', 'tf', 'tfidf'):
            warning = (
                "*** ERROR: term weighting label (opt: -w LABEL)! ***\n"
                "    -- value (%s) not recognised!\n"
                "    -- must be one of: binary / tf / tfidf"
            ) % weighting
            print(warning, file=sys.stderr)
            print_help()
            return
        self.term_weighting = weighting

        if '-o' not in opts:
            print("*** ERROR: must specify output file (opt: -o FILE) ***",
                  file=sys.stderr)
            print_help()
            return
        self.outfile = opts['-o']

        stoplist = 'yes' if '-s' in opts else 'no'
        stemming = 'yes' if '-p' in opts else 'no'

        # NOTE(security): pickle.load can execute arbitrary code while
        # unpickling; IR_data.pickle must come from a trusted source.
        with open('IR_data.pickle', 'rb') as data_in:
            all_data = pickle.load(data_in)

        # Index and queries share the same configuration suffix.
        suffix = 'stoplist_%s_stemming_%s' % (stoplist, stemming)
        self.index = all_data['index_' + suffix]
        self.queries = all_data['queries_' + suffix]

        self.exit = False
# ==============================================================================
# Store for Retrieval Results
class ResultStore:
    """Collect per-query document id lists and write them to a file.

    max_results -- maximum number of document ids kept per query; only the
                   leading entries of a longer list are stored. Defaults to
                   10, the cut-off that was previously hard-coded.
    """

    def __init__(self, max_results=10):
        # A negative limit would make the slice in store() drop entries
        # from the end instead of capping the length.
        if max_results < 0:
            raise ValueError("max_results must be non-negative")
        self.max_results = max_results
        self.results = []  # list of (qid, docids) pairs, in insertion order

    def store(self, qid, docids):
        """Record *docids* for query *qid*, truncated to ``max_results``."""
        if len(docids) > self.max_results:
            docids = docids[:self.max_results]
        self.results.append((qid, docids))

    def output(self, outfile):
        """Write one "qid docid" line per stored result to *outfile*."""
        with open(outfile, 'w') as out:
            for qid, docids in self.results:
                for docid in docids:
                    print(qid, docid, file=out)
# ==============================================================================
# MAIN
if __name__ == '__main__':
    # Parse the options; stop here if help was shown or they were invalid.
    cmdline = CommandLine()
    if cmdline.exit:
        sys.exit(0)

    # Build the retriever for the selected index and weighting scheme.
    engine = Retrieve(cmdline.index, cmdline.term_weighting)

    # Run every query and collect its results, then write them all out.
    store = ResultStore()
    for qid, query in cmdline.queries:
        store.store(qid, engine.for_query(query))
    store.output(cmdline.outfile)