-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathfindbadlines.py
More file actions
38 lines (28 loc) · 831 Bytes
/
findbadlines.py
File metadata and controls
38 lines (28 loc) · 831 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
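'''
This script looks for encoding errors in stdin and prints out the bad lines,
with the undecodable bytes shown as highlighted \\xNN escapes.

Usage:
    python3 findbadlines.py [encoding] < input

The default encoding is utf-8.
The exit status is 1 if any bad line was found, 0 otherwise.
'''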
import sys
import codecs
import unicodedata
def replace_escape(ex):
    r'''
    Codec error handler that makes undecodable bytes visible.

    Every byte in ex.object[ex.start:ex.end] is rendered as a two-digit
    hexadecimal escape (\xNN), and the whole run is wrapped in the ANSI
    reverse-video sequence (ESC[7m ... ESC[0m).

    ex is the error object handed over by the codec machinery; its
    .object, .start and .end attributes are read.

    Returns a (replacement string, resume position) tuple, with the resume
    position being ex.end.
    '''
    # %02x keeps bytes below 0x10 two digits wide (\x05, not \x5), so the
    # escapes stay unambiguous when several of them are adjacent.
    escaped = ''.join('\\x%02x' % b for b in ex.object[ex.start:ex.end])
    return '\33[7m%s\33[0m' % escaped, ex.end
# Make the highlighting handler selectable by name in bytes.decode().
codecs.register_error('replace_escape', replace_escape)

# The first command-line argument, when given, overrides the utf-8 default.
encoding = sys.argv[1] if len(sys.argv) > 1 else 'utf-8'

# Becomes True once any line fails to decode; passed to sys.exit() at the end.
error = False
out = sys.stdout

# Read stdin as raw bytes so that decoding is done (and checked) here,
# one line at a time, with line numbers starting at 1.
for lineno, raw in enumerate(sys.stdin.buffer, 1):
    try:
        # Strict decode: the result is discarded, only failure matters.
        raw.decode(encoding)
    except UnicodeDecodeError:
        error = True
        out.write('===== Bad line %d =====\n' % lineno)
        # Decode again with the custom handler so the offending bytes are
        # shown as highlighted escapes instead of aborting the decode.
        out.write(raw.decode(encoding, 'replace_escape'))
        out.flush()

sys.exit(error)