Skip to content

Commit 1d8d489

Browse files
committed
add 0.1.1
1 parent a274e73 commit 1d8d489

File tree

4 files changed

+192
-154
lines changed

4 files changed

+192
-154
lines changed

half_json/core.py

+14-154
Original file line numberDiff line numberDiff line change
@@ -1,164 +1,53 @@
11
# coding=utf8
2-
3-
import re
4-
import sys
5-
import traceback
6-
import functools
7-
import json.decoder
8-
9-
from json.decoder import JSONDecoder
10-
from json.scanner import py_make_scanner
11-
from json.decoder import py_scanstring
12-
13-
14-
# errmsg.inv
15-
def inv_errmsg(e, exc_info):
    """Invert a ``json`` decoder error message back into its fields.

    The message is expected to look like
    ``"<errmsg>: line L column C (char P)"`` or
    ``"<errmsg>: line L column C - line L2 column C2 (char P - E)"``.

    Args:
        e: The exception raised by the decoder.  A ``parser`` entry in the
            exception's ``__dict__``, if present, is copied into the result.
        exc_info: Unused; kept so existing callers that pass
            ``sys.exc_info()`` keep working.

    Returns:
        dict with keys ``errmsg``, ``parser``, ``lineno`` and ``colno``;
        plus ``pos`` when the message carries a single position, or
        ``endlineno``, ``endcolno``, ``pos`` and ``end`` when it carries a
        range.

    Raises:
        ValueError: If the message does not match either layout.
    """
    # ``e.message`` only exists on Python 2; ``str(e)`` yields the same text
    # on interpreters where the attribute is missing.
    message = getattr(e, "message", None) or str(e)

    # Split on the LAST colon: the description itself may contain one
    # (bad case for a first-colon split: "Expecting ':' delimiter").
    errmsg, sep, left = message.rpartition(':')
    numbers = [int(n) for n in re.findall(r'\d+', left)]
    count = len(numbers)
    if not sep or count < 2 or count in (4, 5):
        raise ValueError("unrecognized json error message: %r" % (message,))

    result = {
        "errmsg": errmsg,
        "parser": e.__dict__.get("parser", ""),
        "lineno": numbers[0],
        "colno": numbers[1],
    }
    if count == 3:
        result["pos"] = numbers[2]
    elif count > 3:
        # Range layout: start line/col, end line/col, start char, end char.
        result["endlineno"] = numbers[2]
        result["endcolno"] = numbers[3]
        result["pos"] = numbers[4]
        result["end"] = numbers[5]
    return result
39-
40-
41-
# Record which parser an Exception was raised by.
def add_parser_name(parser):
    """Wrap *parser* so that exceptions it raises remember their origin.

    Args:
        parser: Any callable.

    Returns:
        A callable with the same signature and return value as *parser*.
        When *parser* raises, the exception gets a ``parser`` entry holding
        ``parser.__name__`` (unless one is already recorded) and is then
        re-raised unchanged.
    """
    import functools  # local import keeps this block self-contained

    parser_name = getattr(parser, "__name__", repr(parser))

    @functools.wraps(parser)
    def new_parser(*args, **kwargs):
        try:
            return parser(*args, **kwargs)
        except Exception as e:
            # Keep the first name recorded: with nested wrappers the
            # innermost one sees the exception first.
            if "parser" not in e.__dict__:
                e.__dict__["parser"] = parser_name
            # Bare ``raise`` keeps the original traceback intact.
            raise
    return new_parser
53-
54-
55-
def make_decoder():
    """Build a ``JSONDecoder`` whose parsers tag the exceptions they raise.

    The decoder's ``parse_object``, ``parse_array`` and ``parse_string`` are
    replaced by ``add_parser_name`` wrappers (the string parser being the
    pure-Python ``py_scanstring``), and ``scan_once`` is rebuilt with
    ``py_make_scanner``.

    Side effect: ``json.decoder.scanstring`` is replaced module-wide by a
    wrapped ``py_scanstring``.

    Returns:
        The configured ``JSONDecoder`` instance.
    """
    decoder = JSONDecoder()
    # Each parser is wrapped exactly once; wrapping twice only adds a
    # redundant call layer.
    decoder.parse_object = add_parser_name(decoder.parse_object)
    decoder.parse_array = add_parser_name(decoder.parse_array)
    decoder.parse_string = add_parser_name(py_scanstring)

    # Built after the assignments above, presumably so the scanner picks up
    # the wrapped parsers -- confirm against json.scanner.
    decoder.scan_once = py_make_scanner(decoder)

    json.decoder.scanstring = add_parser_name(py_scanstring)
    return decoder
68-
69-
70-
decoder = make_decoder()
71-
72-
"""
73-
ValueError 抛出
74-
01. _decode_uXXXX "Invalid \\uXXXX escape"
75-
02. py_scanstring "Unterminated string starting at"
76-
03. py_scanstring "Invalid control character {0!r} at".format(terminator)
77-
04. py_scanstring "Unterminated string starting at"
78-
05. py_scanstring "Invalid \\escape: " + repr(esc)
79-
06. JSONObject "Expecting property name enclosed in double quotes"
80-
07. JSONObject "Expecting ':' delimiter"
81-
08. JSONObject "Expecting object"
82-
09. JSONObject "Expecting ',' delimiter"
83-
10. JSONObject "Expecting property name enclosed in double quotes"
84-
11. JSONArray "Expecting object"
85-
12. JSONArray "Expecting ',' delimiter"
86-
87-
01 先不看,不研究
88-
02 badcase: " --> "" success
89-
03 控制符 pass
90-
04 unicode \\u 的 pass
91-
05 同上
92-
06 object 后面没有跟随 " , badcase: {abc":1} --> {"abc":1}
93-
07 object key 后面没有 : , badcase: {"abc"1} --> {"abc":1}
94-
08 object 开始检测 Value 收到 StopIteration
95-
08.1 要么后面没有了
96-
08.2 要么后面不是 "/{/[/n[ull]/t[rue]/f[alse]/number/NaN/Infinity/-Infinity 开头的东西
97-
-- 08.1 后面补上 null}
98-
-- 08.2 无脑补一个 "
99-
09 object 解析完一个 pair 后,下一个不是}, 期待一个 ','
100-
badcase {"k":1"s":2}
101-
10 在 09 的基础上解析完{"k":1, 发现下一个不是 ", 这个后面再优化(暂时和 06 一致)
102-
badcase {"k":1,x":2}
103-
11 array 开始检测 Value 收到 StopIteration
104-
11.1 要么后面没有了,补上]
105-
11.2 同 08.2,无脑补一个{ 看看
106-
12 array 解析完前一个 object, 需要一个 ,
107-
这里 nextchar 既不是 ] 也不是, 代表这个 nextchar 的 end 也已经+1 了,所以减 2
108-
"""
109-
110-
def process_number():
    """Placeholder with no behaviour yet; takes nothing and returns None."""
    return None
2+
from half_json.json_util import decoder
3+
from half_json.json_util import errmsg_inv
4+
from half_json.json_util import errors
1125

1136

1147
def find_stop(line):
1158
try:
116-
import pdb
117-
pdb.set_trace()
118-
1199
# 暂时只考虑 1 行的情况
12010
obj, end = decoder.scan_once(line, 0)
12111
# TODO end is only part of line
12212
return end == len(line), line
12313
except StopIteration as e:
12414
return True, ""
12515
except ValueError as e:
126-
err_info = inv_errmsg(e, sys.exc_info())
16+
err_info = errmsg_inv(e)
17+
error = err_info["error"]
12718
pos = err_info["pos"]
12819
nextchar = line[pos: pos+1]
129-
parser = err_info["parser"]
130-
errmsg = err_info["errmsg"]
131-
lastchar = line[pos-1: pos]
20+
# lastchar = line[pos-1: pos]
13221

13322
# 02
134-
if errmsg == "Unterminated string starting at":
23+
if error == errors.StringUnterminatedString:
13524
# TODO resolve "abc --> "abc"
13625
return False, insert_line(line, "\"", len(line))
13726
# 06
138-
if errmsg == "Expecting property name enclosed in double quotes":
27+
if error == errors.ObjectExceptKey:
13928
# lastchar = line[pos-1: pos]
14029
# for case {
14130
# if lastchar == "{" and all([c not in line for c in '"}:']):
14231
# return False, insert_line(line, "}", pos)
14332
return False, insert_line(line, "\"", pos)
14433
# 07
145-
if errmsg == "Expecting ':' delimiter":
34+
if error == errors.ObjectExceptColon:
14635
return False, insert_line(line, ":", pos)
14736
# 08
148-
if parser == "JSONObject" and errmsg == "Expecting object":
37+
if error == errors.ObjectExceptObject:
14938
# 08.1
15039
if nextchar == "":
15140
return False, insert_line(line, "null}", pos)
15241
# 08.2
15342
else:
15443
return False, insert_line(line, "\"", pos)
15544
# 09
156-
if parser == "JSONObject" and errmsg == "Expecting ',' delimiter":
45+
if error == errors.ObjectExceptComma:
15746
if nextchar == "":
15847
return False, insert_line(line, "}", pos)
15948
return False, insert_line(line, ",", pos)
16049
# 11
161-
if parser == "JSONArray" and errmsg == "Expecting object":
50+
if error == errors.ArrayExceptObject:
16251
# ?
16352
if nextchar == ",":
16453
return False, insert_line(line, "null", pos)
@@ -170,7 +59,7 @@ def find_stop(line):
17059
return False, insert_line(line, "{", pos)
17160
# 也许可以删掉前面的 , 补一个]
17261
# 12
173-
if parser == "JSONArray" and errmsg == "Expecting ',' delimiter":
62+
if error == errors.ArrayExceptComma:
17463
"""
17564
code:
17665
end += 1
@@ -180,7 +69,7 @@ def find_stop(line):
18069
raise ValueError(errmsg("Expecting ',' delimiter", s, end))
18170
"""
18271
pos = pos - 1
183-
nextchar = line[pos: pos +1]
72+
nextchar = line[pos: pos + 1]
18473
# 11.1
18574
if nextchar == "":
18675
return False, insert_line(line, "]", pos)
@@ -201,32 +90,3 @@ def clear(line):
20190
if ok:
20291
break
20392
return ok, line
204-
205-
206-
def main(infile, outfile):
    """Repair every line of *infile* and write the fixed lines to *outfile*.

    Each stripped input line is passed to ``clear``.  Lines it reports as
    fixed are written to *outfile* followed by a newline; the others are
    printed.  A summary with the hit ratio is printed at the end.

    Args:
        infile: Path of the text file to read, one JSON document per line.
        outfile: Path of the file that receives the repaired lines.
    """
    total = 0
    hit = 0

    # ``with`` closes both files even if something below raises.
    with open(infile, 'r') as inf, open(outfile, 'w') as outf:
        for line in inf:
            try:
                total += 1
                line = line.strip()
                ok, new_line = clear(line)
                if ok:
                    outf.write(new_line + "\n")
                    hit += 1
                else:
                    print(ok, line, new_line)
            except Exception as e:
                # Best effort: report the failing line and keep going.
                print(e, line)

    # An empty input file must not end in a ZeroDivisionError.
    ratio = hit * 1.0 / total if total else 0.0
    print("total is {} and hit {} --> ratio:{} \n".format(total, hit, ratio))
229-
230-
231-
if __name__ == '__main__':
232-
main(sys.argv[1], sys.argv[2])

half_json/json_util.py

+112
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
# coding=utf8
2+
3+
import re
4+
5+
import json.decoder
6+
from json.decoder import JSONDecoder
7+
from json.scanner import py_make_scanner
8+
from json.decoder import py_scanstring
9+
10+
11+
class JSONDecodeError(object):
    """Description of a JSON decoding failure: raising parser plus message.

    Instances are plain value objects, not exceptions.  Two of them compare
    equal when their ``parser`` names are identical and one ``message``
    contains the other, so a short template such as
    ``"Invalid control character"`` matches the longer concrete message no
    matter which operand is on the left.

    Attributes are documented where they are assigned in ``__init__``.
    """

    def __init__(self, parser, message):
        # Text of the error, without the trailing position information.
        self.message = message
        # Name of the parser that raised the error.
        self.parser = parser

    def __eq__(self, err):
        # Duck-typed: anything exposing ``parser`` and ``message`` can be
        # compared; everything else is left to Python's default handling.
        try:
            other_parser = err.parser
            other_message = err.message
        except AttributeError:
            return NotImplemented
        if other_parser != self.parser:
            return False
        # Symmetric containment keeps ``a == b`` and ``b == a`` in agreement.
        return self.message in other_message or other_message in self.message

    def __ne__(self, err):
        # Python 2 does not derive ``!=`` from ``__eq__``.
        equal = self.__eq__(err)
        if equal is NotImplemented:
            return equal
        return not equal

    def __repr__(self):
        return "JSONDecodeError(%r, %r)" % (self.parser, self.message)
19+
20+
21+
class errors(object):
    """Catalogue of the decoder failures this package can recognise.

    Every attribute is a ``JSONDecodeError`` built from the name recorded
    for the raising parser and the message text to match against.
    """

    # --- failures tagged with the array parser ---------------------------
    ArrayExceptComma = JSONDecodeError(
        "JSONArray", "Expecting ',' delimiter")
    ArrayExceptObject = JSONDecodeError(
        "JSONArray", "Expecting object")

    # --- failures tagged with the object parser --------------------------
    ObjectExceptComma = JSONDecodeError(
        "JSONObject", "Expecting ',' delimiter")
    # Author's note: this message covers 2 different cases.
    ObjectExceptKey = JSONDecodeError(
        "JSONObject", "Expecting property name enclosed in double quotes")
    ObjectExceptObject = JSONDecodeError(
        "JSONObject", "Expecting object")
    ObjectExceptColon = JSONDecodeError(
        "JSONObject", "Expecting ':' delimiter")

    # --- failures tagged with the string scanner -------------------------
    StringInvalidEscape = JSONDecodeError(
        "py_scanstring", "Invalid \\escape")
    StringInvalidControlCharacter = JSONDecodeError(
        "py_scanstring", "Invalid control character")
    # Author's note: this message covers 2 different cases.
    StringUnterminatedString = JSONDecodeError(
        "py_scanstring", "Unterminated string starting at")
    StringInvalidUXXXXEscape = JSONDecodeError(
        "py_scanstring", "Invalid \\uXXXX escape")
35+
36+
"""
37+
01 先不看,不研究
38+
02 badcase: " --> "" success
39+
03 控制符 pass
40+
04 unicode \\u 的 pass
41+
05 同上
42+
06 object 后面没有跟随 " , badcase: {abc":1} --> {"abc":1}
43+
07 object key 后面没有 : , badcase: {"abc"1} --> {"abc":1}
44+
08 object 开始检测 Value 收到 StopIteration
45+
08.1 要么后面没有了
46+
08.2 要么后面不是 "/{/[/n[ull]/t[rue]/f[alse]/number/NaN/Infinity/-Infinity 开头的东西
47+
-- 08.1 后面补上 null}
48+
-- 08.2 无脑补一个 "
49+
09 object 解析完一个 pair 后,下一个不是}, 期待一个 ','
50+
badcase {"k":1"s":2}
51+
10 在 09 的基础上解析完{"k":1, 发现下一个不是 ", 这个后面再优化(暂时和 06 一致)
52+
badcase {"k":1,x":2}
53+
11 array 开始检测 Value 收到 StopIteration
54+
11.1 要么后面没有了,补上]
55+
11.2 同 08.2,无脑补一个{ 看看
56+
12 array 解析完前一个 object, 需要一个 ,
57+
这里 nextchar 既不是 ] 也不是, 代表这个 nextchar 的 end 也已经+1 了,所以减 2
58+
"""
59+
60+
61+
def errmsg_inv(e):
    """Invert a ``json`` decoder error message back into its fields.

    The message is expected to look like
    ``"<errmsg>: line L column C (char P)"`` or
    ``"<errmsg>: line L column C - line L2 column C2 (char P - E)"``.

    Args:
        e: The ``ValueError`` raised by the decoder.  A ``parser`` entry in
            the exception's ``__dict__``, if present, names the parser that
            raised it.

    Returns:
        dict with keys ``error`` (a ``JSONDecodeError`` built from the
        parser name and the message text), ``lineno`` and ``colno``; plus
        ``pos`` when the message carries a single position, or
        ``endlineno``, ``endcolno``, ``pos`` and ``end`` when it carries a
        range.

    Raises:
        TypeError: If *e* is not a ``ValueError``.
        ValueError: If the message does not match either layout.
    """
    # Explicit check instead of ``assert``, which is stripped under ``-O``.
    if not isinstance(e, ValueError):
        raise TypeError("expected a ValueError, got %r" % (type(e),))

    # ``e.message`` only exists on Python 2; ``str(e)`` yields the same text
    # on interpreters where the attribute is missing.
    message = getattr(e, "message", None) or str(e)

    # Split on the LAST colon: the description itself may contain one
    # (for example "Expecting ':' delimiter").
    errmsg, sep, left = message.rpartition(':')
    numbers = [int(n) for n in re.findall(r'\d+', left)]
    count = len(numbers)
    if not sep or count < 2 or count in (4, 5):
        raise ValueError("unrecognized json error message: %r" % (message,))

    parser = e.__dict__.get("parser", "")
    result = {
        "error": JSONDecodeError(parser, errmsg),
        "lineno": numbers[0],
        "colno": numbers[1],
    }
    if count == 3:
        result["pos"] = numbers[2]
    elif count > 3:
        # Range layout: start line/col, end line/col, start char, end char.
        result["endlineno"] = numbers[2]
        result["endcolno"] = numbers[3]
        result["pos"] = numbers[4]
        result["end"] = numbers[5]
    return result
85+
86+
87+
def record_parser_name(parser):
    """Wrap *parser* so that exceptions it raises remember their origin.

    Args:
        parser: Any callable.

    Returns:
        A callable with the same signature and return value as *parser*.
        When *parser* raises, the exception gets a ``parser`` entry holding
        ``parser.__name__`` (unless one is already recorded) and is then
        re-raised unchanged.
    """
    import functools  # local import: this module does not import functools

    parser_name = getattr(parser, "__name__", repr(parser))

    @functools.wraps(parser)
    def new_parser(*args, **kwargs):
        try:
            return parser(*args, **kwargs)
        except Exception as e:
            # Keep the first name recorded: with nested wrappers the
            # innermost one sees the exception first.
            if "parser" not in e.__dict__:
                e.__dict__["parser"] = parser_name
            # Bare ``raise`` keeps the original traceback intact.
            raise
    return new_parser
97+
98+
99+
def make_decoder():
    """Build a ``JSONDecoder`` whose parsers tag the exceptions they raise.

    The decoder's ``parse_object``, ``parse_array`` and ``parse_string`` are
    replaced by ``record_parser_name`` wrappers (the string parser being the
    pure-Python ``py_scanstring``), and ``scan_once`` is rebuilt with
    ``py_make_scanner``.

    Side effect: ``json.decoder.scanstring`` is replaced module-wide by a
    wrapped ``py_scanstring``.

    Returns:
        The configured ``JSONDecoder`` instance.
    """
    json.decoder.scanstring = record_parser_name(py_scanstring)

    decoder = JSONDecoder()
    # Each parser is wrapped exactly once; wrapping twice only adds a
    # redundant call layer.
    decoder.parse_object = record_parser_name(decoder.parse_object)
    decoder.parse_array = record_parser_name(decoder.parse_array)
    decoder.parse_string = record_parser_name(py_scanstring)

    # Built after the assignments above, presumably so the scanner picks up
    # the wrapped parsers -- confirm against json.scanner.
    decoder.scan_once = py_make_scanner(decoder)
    return decoder
110+
111+
112+
decoder = make_decoder()

half_json/main.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# coding=utf8
2+
import sys
3+
4+
from half_json.core import clear
5+
6+
7+
def fixjson():
    """Command-line entry point: repair a file of JSON lines.

    Reads the input path from ``sys.argv[1]`` and the output path from
    ``sys.argv[2]``.  Each stripped input line is passed to ``clear``; lines
    it reports as fixed are written to the output file followed by a
    newline, the others are printed.  A summary with the hit ratio is
    printed at the end.
    """
    infile = sys.argv[1]
    outfile = sys.argv[2]

    total = 0
    hit = 0

    # ``with`` closes both files even if something below raises.
    with open(infile, 'r') as inf, open(outfile, 'w') as outf:
        for line in inf:
            try:
                total += 1
                line = line.strip()
                ok, new_line = clear(line)
                if ok:
                    outf.write(new_line + "\n")
                    hit += 1
                else:
                    print(ok, line, new_line)
            except Exception as e:
                # Best effort: report the failing line and keep going.
                print(e, line)

    # An empty input file must not end in a ZeroDivisionError.
    ratio = hit * 1.0 / total if total else 0.0
    print("total is {} and hit {} --> ratio:{} \n".format(total, hit, ratio))

0 commit comments

Comments
 (0)