build_index.py
import json
import os
import shutil
from typing import TYPE_CHECKING

import jieba
from jieba.analyse import ChineseAnalyzer
from whoosh.analysis import Token, Tokenizer
from whoosh.fields import NUMERIC, TEXT, Schema
from whoosh.index import create_in, open_dir
from zhconv import convert

from ocr import OCR_RESULTS_DIR

if TYPE_CHECKING:
    from whoosh.index import Index

INDEX_DIR = "index"
# Minimum OCR confidence a character must have to be included in the index.
PROB_THRESHOLD = 0.2


class DummyAnalyzer(Tokenizer):
    """Analyzer that indexes text one character at a time (no word segmentation)."""

    def __call__(self, text, **kwargs):
        cur_pos = 0
        for word in text:
            token = Token()
            token.original = token.text = word
            token.pos = cur_pos
            token.startchar = cur_pos
            token.endchar = cur_pos
            cur_pos += 1
            yield token


def validate():
    """Check that the index holds one document per file in the OCR results directory."""
    ix = open_dir(INDEX_DIR)
    expected = len(os.listdir(OCR_RESULTS_DIR))
    actual = ix.searcher().doc_count()
    return actual == expected, expected, actual


def load():
    return open_dir(INDEX_DIR)


def build(quiet: bool = False, show_progress: bool = False) -> "Index":
    # Rebuild the index from scratch.
    shutil.rmtree(INDEX_DIR, ignore_errors=True)
    jieba.load_userdict("dict.txt")
    analyzer = ChineseAnalyzer()
    schema = Schema(
        vol=NUMERIC(stored=True, sortable=True),
        page=NUMERIC(stored=True, sortable=True),
        side=NUMERIC(stored=True, sortable=True),
        # Traditional Chinese text, segmented with jieba.
        content_t_cn=TEXT(stored=True, analyzer=analyzer),
        # Simplified Chinese text, segmented with jieba.
        content_s_cn=TEXT(stored=True, analyzer=analyzer),
        # Raw text, indexed character by character.
        content_raw=TEXT(stored=True, analyzer=DummyAnalyzer()),
    )
    os.mkdir(INDEX_DIR)
    ix = create_in(INDEX_DIR, schema)
    _build(ix, quiet, show_progress)
    return ix


def _build(idx: "Index", quiet: bool, show_progress: bool):
    writer = idx.writer()
    files = os.listdir(OCR_RESULTS_DIR)
    progress_bar = None
    if show_progress:
        # Streamlit is only needed when a progress bar is requested.
        import streamlit as st

        st.info("構建索引中...")  # "Building index..."
        progress_bar = st.progress(0)
    for i, f in enumerate(files):
        if not f.endswith(".json"):
            continue
        progress = float(i) / len(files)
        if not quiet:
            print(f"{f}, progress: {i} / {len(files)}")
        if progress_bar is not None:
            progress_bar.progress(progress, text=f"({i} / {len(files)})")
        with open(f"{OCR_RESULTS_DIR}/{f}") as fd:
            j = json.load(fd)
        # Keep only characters whose OCR confidence exceeds the threshold.
        chars = []
        for char_id in j["char_ids"]:
            if j["char_probs"][char_id] > PROB_THRESHOLD:
                chars.append(j["chars"][char_id])
        content_t_cn = "".join(chars)
        # Derive the Simplified Chinese variant from the Traditional text.
        content_s_cn = convert(content_t_cn, "zh-cn")
        # File names follow the pattern "<vol>_<page>_<side>.json".
        vol, page, side = f[:-5].split("_")
        writer.add_document(
            vol=vol,
            page=page,
            side=side,
            content_t_cn=content_t_cn,
            content_s_cn=content_s_cn,
            content_raw=content_t_cn,
        )
    if show_progress:
        import streamlit as st

        st.info("寫入索引中...")  # "Writing index..."
    writer.commit()


if __name__ == "__main__":
    build(quiet=False)
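
For reference, a minimal sketch of how the resulting index could be queried with Whoosh, assuming the schema defined in build() above; this snippet is not part of build_index.py, and the query term and result limit are placeholders.

from whoosh.qparser import MultifieldParser

from build_index import load

ix = load()
with ix.searcher() as searcher:
    # Search both the Traditional and Simplified Chinese fields.
    parser = MultifieldParser(["content_t_cn", "content_s_cn"], schema=ix.schema)
    query = parser.parse("佛")  # placeholder query term
    for hit in searcher.search(query, limit=10):
        print(hit["vol"], hit["page"], hit["side"])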