Commit 3850dbf

Minor style tweaks
1 parent ba36a4d commit 3850dbf

21 files changed: +67 −50 lines changed

README.md

Lines changed: 1 addition & 3 deletions
@@ -67,9 +67,7 @@ These instructions were tested on Mac OS X 10.12. Your mileage may vary. Correct
 
 Help and pull requests are welcomed!
 
-- Move some legacy scripts like `build-kobo`, and `hyphenate` into appropriate libraries/scripts.
-
-- Update scripts to use new library global variables like `XHTML_NAMESPACES`, instead of redefining them in each script.
+- Move some legacy scripts like `build-kobo` and `hyphenate` into appropriate libraries/scripts.
 
 # Tool descriptions

british2american

Lines changed: 2 additions & 4 deletions
@@ -4,9 +4,7 @@ import argparse
 import os
 import fnmatch
 import regex
-
-
-IGNORED_FILENAMES = ["colophon.xhtml", "titlepage.xhtml", "imprint.xhtml", "uncopyright.xhtml", "halftitle.xhtml", "toc.xhtml", "loi.xhtml"]
+import se
 
 
 def main():
@@ -26,7 +24,7 @@ def main():
     if os.path.isdir(target):
         for root, _, filenames in os.walk(target):
             for filename in fnmatch.filter(filenames, "*.xhtml"):
-                if filename not in IGNORED_FILENAMES:
+                if filename not in se.IGNORED_FILENAMES:
                     target_filenames.add(os.path.join(root, filename))
     else:
         target_filenames.add(target)
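
Across the tools, the per-script IGNORED_FILENAMES list is replaced by the shared se.IGNORED_FILENAMES. The se package's own diff is not expanded in this view, but the centralized constant is presumably just the deleted list moved to package level, something like the sketch below (the se/__init__.py location is an assumption):

# Sketch (assumption): the list deleted above, relocated to the se package
# (e.g. se/__init__.py) so every tool can reference se.IGNORED_FILENAMES.
IGNORED_FILENAMES = ["colophon.xhtml", "titlepage.xhtml", "imprint.xhtml", "uncopyright.xhtml", "halftitle.xhtml", "toc.xhtml", "loi.xhtml"]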

build-images

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,7 @@ import subprocess
 import regex
 import se
 
+
 def clean_inkscape_svg(filename, clean_path):
     with open(filename, "r+") as file:
         svg = file.read()
@@ -148,5 +149,6 @@ def main():
     if args.verbose:
         print("\t./images/titlepage.svg not found, skipping ...")
 
+
 if __name__ == "__main__":
     main()

clean

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@ import subprocess
 import regex
 import se
 
+
 def main():
     parser = argparse.ArgumentParser(description="Prettify and canonicalize individual XHTML or SVG files, or all XHTML and SVG files in a source directory. Note that this only prettifies the source code; it doesn't perform typography changes.")
     parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")

dec2roman

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,7 @@ import sys
 import roman
 import se
 
+
 def main():
     parser = argparse.ArgumentParser(description="Convert a decimal number to a Roman numeral.")
     parser.add_argument("-n", "--no-newline", dest="newline", action="store_false", help="don't end output with a newline")
@@ -30,5 +31,6 @@ def main():
     else:
         print(roman.toRoman(int(line)), end="")
 
+
 if __name__ == "__main__":
     main()

extract-ebook

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@ import magic
 from se.kindleunpack import kindleunpack # GPLv3: https://www.mobileread.com/forums/showthread.php?t=61986
 import se
 
+
 def main():
     parser = argparse.ArgumentParser(description="Extract an EPUB, MOBI, or AZW3 ebook into ./FILENAME.extracted/ or a target directory.")
     parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
@@ -55,5 +56,6 @@ def main():
     if args.verbose:
         print(" OK")
 
+
 if __name__ == "__main__":
     main()

find-mismatched-diacritics

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@ import fnmatch
 import unicodedata
 import regex
 
+
 def main():
     parser = argparse.ArgumentParser(description="Find words with mismatched diacritics in XHTML files. For example, \"cafe\" in one file and \"café\" in another.")
     parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
@@ -68,5 +69,6 @@ def main():
     if args.verbose:
         print(" OK")
 
+
 if __name__ == "__main__":
     main()

hyphenate

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,7 @@ import regex
 from hyphen import Hyphenator, dict_info
 from bs4 import BeautifulSoup
 
+
 def main():
     parser = argparse.ArgumentParser(description="Insert soft hyphens at syllable breaks in XHTML files.")
     parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
@@ -139,5 +140,6 @@ def main():
     if args.verbose:
         print(" OK")
 
+
 if __name__ == "__main__":
     main()

make-url-safe

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@ import argparse
 import sys
 import se.formatting
 
+
 def main():
     parser = argparse.ArgumentParser(description="Make a string URL-safe.")
     parser.add_argument("-n", "--no-newline", dest="newline", action="store_false", help="don't end output with a newline")
@@ -25,5 +26,6 @@ def main():
     else:
         print(se.formatting.make_url_safe(line), end="")
 
+
 if __name__ == "__main__":
     main()

prepare-release

Lines changed: 1 addition & 0 deletions
@@ -100,5 +100,6 @@ def main():
     file.write(processed_xhtml)
     file.truncate()
 
+
 if __name__ == "__main__":
     main()

print-manifest-and-spine

Lines changed: 1 addition & 0 deletions
@@ -25,5 +25,6 @@ def main():
     if args.manifest:
         print(se_epub.generate_spine())
 
+
 if __name__ == "__main__":
     main()

reading-ease

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,6 @@ from bs4 import BeautifulSoup
 import se
 
 
-IGNORED_FILENAMES = ["colophon.xhtml", "titlepage.xhtml", "imprint.xhtml", "uncopyright.xhtml", "halftitle.xhtml", "toc.xhtml", "loi.xhtml"]
 INCLUDED_CHARACTERS = list(string.whitespace) + list(string.digits) + [":", ";", ".", "?", "!"]
 
 
@@ -149,7 +148,7 @@ def main():
 
     for root, _, filenames in os.walk(directory):
         for filename in fnmatch.filter(filenames, "*.xhtml"):
-            if filename not in IGNORED_FILENAMES:
+            if filename not in se.IGNORED_FILENAMES:
                 with open(os.path.join(root, filename), "r", encoding="utf-8") as file:
                     text += BeautifulSoup(file.read(), "lxml").body.get_text() + " "
 
@@ -192,5 +191,6 @@ def main():
     else:
         print(flesch_reading_ease)
 
+
 if __name__ == "__main__":
     main()

reorder-endnotes

Lines changed: 0 additions & 1 deletion
@@ -15,7 +15,6 @@ def is_positive_integer(value):
 
     return int_value
 
-
 def main():
     parser = argparse.ArgumentParser(description="Increment the specified endnote and all following endnotes by 1.")
     group = parser.add_mutually_exclusive_group(required=True)

roman2dec

Lines changed: 1 addition & 0 deletions
@@ -32,5 +32,6 @@ def main():
     se.print_error("Not a Roman numeral: {}".format(line))
     exit(1)
 
+
 if __name__ == "__main__":
     main()

semanticate

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,7 @@ import fnmatch
 import regex
 from bs4 import BeautifulSoup
 
+
 def main():
     parser = argparse.ArgumentParser(description="Automatically add semantics to Standard Ebooks source directories.")
     parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
@@ -102,5 +103,6 @@ def main():
     if args.verbose:
         print(" OK")
 
+
 if __name__ == "__main__":
     main()

simplify-tags

Lines changed: 1 addition & 0 deletions
@@ -192,5 +192,6 @@ def main():
     if args.verbose:
         print(" OK")
 
+
 if __name__ == "__main__":
     main()

split-file

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,7 @@ import os
 import regex
 import se.epub
 
+
 def main():
     parser = argparse.ArgumentParser(description="Split an XHTML file into many files at all instances of <!--se:split-->, and include a header template for each file.")
     parser.add_argument("filename", metavar="FILE", help="an XHTML file")
@@ -42,5 +43,6 @@ def output(chapter_number, header_xhtml, chapter_xhtml):
     file.write(header_xhtml.replace("NUMBER", str(chapter_number)) + "\n" + chapter_xhtml + "\n</section></body></html>")
     file.truncate()
 
+
 if __name__ == "__main__":
     main()

titlecase

Lines changed: 1 addition & 0 deletions
@@ -26,5 +26,6 @@ def main():
     else:
         print(se.formatting.titlecase(line), end="")
 
+
 if __name__ == "__main__":
     main()

typogrify

Lines changed: 36 additions & 38 deletions
@@ -6,11 +6,8 @@ import fnmatch
 import html
 import regex
 import smartypants
+import se
 
-# Some convenience aliases
-WORD_JOINER = "\u2060" # word joiner, U+2060
-HAIR_SPACE = "\u200a" # hair space, U+200A
-NBSP = "\u00a0" # no-break space, U+00A0
 
 def main():
     parser = argparse.ArgumentParser(description="Apply some scriptable typography rules from the Standard Ebooks typography manual to XHTML files.")
@@ -63,47 +60,47 @@ def main():
     processed_xhtml = regex.sub(r"—”([a-z])", r"—“\1", processed_xhtml, flags=regex.IGNORECASE)
     processed_xhtml = regex.sub(r"—’([a-z])", r"—‘\1", processed_xhtml, flags=regex.IGNORECASE)
     processed_xhtml = regex.sub(r"-“</p>", r"—”</p>", processed_xhtml, flags=regex.IGNORECASE)
-    processed_xhtml = regex.sub(r"‘”</p>", r"’{}”</p>".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"‘”</p>", r"’{}”</p>".format(se.HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
 
     # Remove spaces between en and em dashes
     # Note that we match at least one character before the dashes, so that we don't catch start-of-line em dashes like in poetry.
     processed_xhtml = regex.sub(r"([^\.\s])\s*([–—])\s*", r"\1\2", processed_xhtml)
 
     # First, remove stray word joiners
-    processed_xhtml = processed_xhtml.replace(WORD_JOINER, "")
+    processed_xhtml = processed_xhtml.replace(se.WORD_JOINER, "")
 
     # Some older texts use the ,— construct; remove that archaichism
     processed_xhtml = processed_xhtml.replace(",—", "—")
 
     # Em dashes and two-em-dashes can be broken before, so add a word joiner between letters/punctuation and the following em dash
-    processed_xhtml = regex.sub(r"([^\s{}{}{}])([—⸻])".format(WORD_JOINER, NBSP, HAIR_SPACE), r"\1{}\2".format(WORD_JOINER), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"([^\s{}{}{}])([—⸻])".format(se.WORD_JOINER, se.NO_BREAK_SPACE, se.HAIR_SPACE), r"\1{}\2".format(se.WORD_JOINER), processed_xhtml, flags=regex.IGNORECASE)
 
     # Add en dashes; don't replace match that is within an html tag, since ids and attrs often containg the pattern DIGIT-DIGIT
     processed_xhtml = regex.sub(r"(?<!<[^>]*)([0-9]+)\-([0-9]+)", r"\1–\2", processed_xhtml)
 
     # Add a word joiner on both sides of en dashes
-    processed_xhtml = regex.sub(r"{}?–{}?".format(WORD_JOINER, WORD_JOINER), r"{}–{}".format(WORD_JOINER, WORD_JOINER), processed_xhtml)
+    processed_xhtml = regex.sub(r"{}?–{}?".format(se.WORD_JOINER, se.WORD_JOINER), r"{}–{}".format(se.WORD_JOINER, se.WORD_JOINER), processed_xhtml)
 
     # Add a word joiner if eliding a word with a two-em-dash
     # Word joiner isn't necessary if punctuation follows
     # Note the \p{{P}}. We must double-curl {} because that's the escape sequence when using .format(). The actual regex should be \p{P} to match punctuation
-    processed_xhtml = regex.sub(r"([^\s{}{}{}])⸺".format(WORD_JOINER, NBSP, HAIR_SPACE), r"\1{}⸺".format(WORD_JOINER), processed_xhtml)
-    processed_xhtml = regex.sub(r"⸺([^\s\p{{P}}{}])".format(WORD_JOINER), r"⸺{}\1".format(WORD_JOINER), processed_xhtml)
+    processed_xhtml = regex.sub(r"([^\s{}{}{}])⸺".format(se.WORD_JOINER, se.NO_BREAK_SPACE, se.HAIR_SPACE), r"\1{}⸺".format(se.WORD_JOINER), processed_xhtml)
+    processed_xhtml = regex.sub(r"⸺([^\s\p{{P}}{}])".format(se.WORD_JOINER), r"⸺{}\1".format(se.WORD_JOINER), processed_xhtml)
 
     # Remove word joiners from following opening tags--they're usually never correct
-    processed_xhtml = regex.sub(r"<([a-z]+)([^>]*?)>{}".format(WORD_JOINER), r"<\1\2>", processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"<([a-z]+)([^>]*?)>{}".format(se.WORD_JOINER), r"<\1\2>", processed_xhtml, flags=regex.IGNORECASE)
 
     # Finally fix some other mistakes
     processed_xhtml = processed_xhtml.replace("—-", "—")
 
     # Replace Mr., Mrs., and other abbreviations, and include a non-breaking space
-    processed_xhtml = regex.sub(r"\b(Mr|Mr?s|Drs?|Profs?|Lieut|Fr|Lt|Capt|Pvt|Esq|Mt|St|MM|Mmes?|Mlles?)\.?\s+", r"\1.{}".format(NBSP), processed_xhtml)
-    processed_xhtml = regex.sub(r"<abbr>(Mr|Mr?s|Drs?|Profs?|Lieut|Fr|Lt|Capt|Pvt|Esq|Mt|St|MM|Mmes?|Mlles?)\.</abbr>?\s+", r"<abbr>\1.</abbr>{}".format(NBSP), processed_xhtml)
+    processed_xhtml = regex.sub(r"\b(Mr|Mr?s|Drs?|Profs?|Lieut|Fr|Lt|Capt|Pvt|Esq|Mt|St|MM|Mmes?|Mlles?)\.?\s+", r"\1.{}".format(se.NO_BREAK_SPACE), processed_xhtml)
+    processed_xhtml = regex.sub(r"<abbr>(Mr|Mr?s|Drs?|Profs?|Lieut|Fr|Lt|Capt|Pvt|Esq|Mt|St|MM|Mmes?|Mlles?)\.</abbr>?\s+", r"<abbr>\1.</abbr>{}".format(se.NO_BREAK_SPACE), processed_xhtml)
 
-    processed_xhtml = regex.sub(r"\bNo\.\s+([0-9]+)", r"No.{}\1".format(NBSP), processed_xhtml)
-    processed_xhtml = regex.sub(r"<abbr>No\.</abbr>\s+", r"<abbr>No.</abbr>{}".format(NBSP), processed_xhtml)
+    processed_xhtml = regex.sub(r"\bNo\.\s+([0-9]+)", r"No.{}\1".format(se.NO_BREAK_SPACE), processed_xhtml)
+    processed_xhtml = regex.sub(r"<abbr>No\.</abbr>\s+", r"<abbr>No.</abbr>{}".format(se.NO_BREAK_SPACE), processed_xhtml)
 
-    processed_xhtml = regex.sub(r"([0-9]+)\s<abbr", r"\1{}<abbr".format(NBSP), processed_xhtml)
+    processed_xhtml = regex.sub(r"([0-9]+)\s<abbr", r"\1{}<abbr".format(se.NO_BREAK_SPACE), processed_xhtml)
 
     # A note on spacing:
     # ibooks kindle (mobi7)
@@ -130,9 +127,9 @@ def main():
     # nth (as in nth degree)
     processed_xhtml = regex.sub(r"\bn\-?th\b", r"<i>n</i>th", processed_xhtml)
 
-    # Remove double spaces that use nbsp for spacing
-    processed_xhtml = regex.sub(r"{}[{} ]+".format(NBSP, NBSP), r" ", processed_xhtml)
-    processed_xhtml = regex.sub(r" [{} ]+".format(NBSP), r" ", processed_xhtml)
+    # Remove double spaces that use se.NO_BREAK_SPACE for spacing
+    processed_xhtml = regex.sub(r"{}[{} ]+".format(se.NO_BREAK_SPACE, se.NO_BREAK_SPACE), r" ", processed_xhtml)
+    processed_xhtml = regex.sub(r" [{} ]+".format(se.NO_BREAK_SPACE), r" ", processed_xhtml)
 
     # House style: remove spacing from common Latinisms
     processed_xhtml = regex.sub(r"([Ii])\.\s+e\.", r"\1.e.", processed_xhtml)
@@ -143,40 +140,40 @@ def main():
     processed_xhtml = regex.sub(r"B\.\s+C\.", r"BC", processed_xhtml)
 
     # Put spacing next to close quotes
-    processed_xhtml = regex.sub(r"“[\s{}]*‘".format(NBSP), r"“{}‘".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
-    processed_xhtml = regex.sub(r"’[\s{}]*”".format(NBSP), r"’{}”".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
-    processed_xhtml = regex.sub(r"“[\s{}]*’".format(NBSP), r"“{}’".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
-    processed_xhtml = regex.sub(r"‘[\s{}]*“".format(NBSP), r"‘{}“".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"“[\s{}]*‘".format(se.NO_BREAK_SPACE), r"“{}‘".format(se.HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"’[\s{}]*”".format(se.NO_BREAK_SPACE), r"’{}”".format(se.HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"“[\s{}]*’".format(se.NO_BREAK_SPACE), r"“{}’".format(se.HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"‘[\s{}]*“".format(se.NO_BREAK_SPACE), r"‘{}“".format(se.HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
 
     # We require a non-letter char at the end, otherwise we might match a contraction: “Hello,” ’e said.
-    processed_xhtml = regex.sub(r"”[\s{}]*’([^a-zA-Z])".format(NBSP), r"”{}’\1".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"”[\s{}]*’([^a-zA-Z])".format(se.NO_BREAK_SPACE), r"”{}’\1".format(se.HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
 
     # Fix ellipses spacing
     processed_xhtml = regex.sub(r"\s*\.\s*\.\s*\.\s*", r"…", processed_xhtml, flags=regex.IGNORECASE)
-    processed_xhtml = regex.sub(r"[\s{}]?…[\s{}]?\.".format(NBSP, NBSP), r".{}…".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
-    processed_xhtml = regex.sub(r"[\s{}]?…[\s{}]?".format(NBSP, NBSP), r"{}… ".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
-    processed_xhtml = regex.sub(r"<p([^>]*?)>{}…".format(HAIR_SPACE), r"<p\1>…", processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"[\s{}]?…[\s{}]?\.".format(se.NO_BREAK_SPACE, se.NO_BREAK_SPACE), r".{}…".format(se.HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"[\s{}]?…[\s{}]?".format(se.NO_BREAK_SPACE, se.NO_BREAK_SPACE), r"{}… ".format(se.HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"<p([^>]*?)>{}…".format(se.HAIR_SPACE), r"<p\1>…", processed_xhtml, flags=regex.IGNORECASE)
 
     # Remove spaces between opening tags and ellipses
-    processed_xhtml = regex.sub(r"(<[a-z0-9]+[^<]+?>)[\s{}]?…".format(NBSP), r"\1…", processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"(<[a-z0-9]+[^<]+?>)[\s{}]?…".format(se.NO_BREAK_SPACE), r"\1…", processed_xhtml, flags=regex.IGNORECASE)
 
     # Remove spaces between closing tags and ellipses
-    processed_xhtml = regex.sub(r"…[\s{}]?(</[a-z0-9]+>)".format(NBSP), r"…\1", processed_xhtml, flags=regex.IGNORECASE)
-    processed_xhtml = regex.sub(r"…[\s{}]+([\)”’])".format(NBSP), r"…\1", processed_xhtml, flags=regex.IGNORECASE)
-    processed_xhtml = regex.sub(r"([\(“‘])[\s{}]+…".format(NBSP), r"\1…", processed_xhtml, flags=regex.IGNORECASE)
-    processed_xhtml = regex.sub(r"…[\s{}]?([\!\?\.\;\,])".format(NBSP), r"…{}\1".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
-    processed_xhtml = regex.sub(r"([\!\?\.\;”’])[\s{}]?…".format(NBSP), r"\1{}…".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
-    processed_xhtml = regex.sub(r"\,[\s{}]?…".format(NBSP), r",{}…".format(HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"…[\s{}]?(</[a-z0-9]+>)".format(se.NO_BREAK_SPACE), r"…\1", processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"…[\s{}]+([\)”’])".format(se.NO_BREAK_SPACE), r"…\1", processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"([\(“‘])[\s{}]+…".format(se.NO_BREAK_SPACE), r"\1…", processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"…[\s{}]?([\!\?\.\;\,])".format(se.NO_BREAK_SPACE), r"…{}\1".format(se.HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"([\!\?\.\;”’])[\s{}]?…".format(se.NO_BREAK_SPACE), r"\1{}…".format(se.HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"\,[\s{}]?…".format(se.NO_BREAK_SPACE), r",{}…".format(se.HAIR_SPACE), processed_xhtml, flags=regex.IGNORECASE)
 
     # Remove spaces between ellipses and endnotes directly after
-    processed_xhtml = regex.sub(r"…[\s{}]?(<a[^>]+?id=\"note-[0-9]+\"[^>]*?>)".format(NBSP), r"…\1", processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"…[\s{}]?(<a[^>]+?id=\"note-[0-9]+\"[^>]*?>)".format(se.NO_BREAK_SPACE), r"…\1", processed_xhtml, flags=regex.IGNORECASE)
 
     # Add non-breaking spaces between amounts with an abbreviated unit. E.g. 8 oz., 10 lbs.
-    processed_xhtml = regex.sub(r"([0-9])\s+([a-z]{1,3}\.)", r"\1{}\2".format(NBSP), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"([0-9])\s+([a-z]{1,3}\.)", r"\1{}\2".format(se.NO_BREAK_SPACE), processed_xhtml, flags=regex.IGNORECASE)
 
     # Add non-breaking spaces between Arabic numbers and AM/PM
-    processed_xhtml = regex.sub(r"([0-9])\s+([ap])\.m\.", r"\1{}\2.m.".format(NBSP), processed_xhtml, flags=regex.IGNORECASE)
-    processed_xhtml = regex.sub(r"([0-9])\s+<abbr([^>]*?)>([ap])\.m\.", r"\1{}<abbr\2>\3.m.".format(NBSP), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"([0-9])\s+([ap])\.m\.", r"\1{}\2.m.".format(se.NO_BREAK_SPACE), processed_xhtml, flags=regex.IGNORECASE)
+    processed_xhtml = regex.sub(r"([0-9])\s+<abbr([^>]*?)>([ap])\.m\.", r"\1{}<abbr\2>\3.m.".format(se.NO_BREAK_SPACE), processed_xhtml, flags=regex.IGNORECASE)
 
     processed_xhtml = processed_xhtml.replace("Ph.D", "PhD")
     processed_xhtml = regex.sub(r"P\.\s*S\.", r"P.S.", processed_xhtml)
@@ -212,5 +209,6 @@ def main():
     if args.verbose:
         print(" OK")
 
+
 if __name__ == "__main__":
     main()
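
The typogrify diff drops the script-local convenience aliases and switches to se.WORD_JOINER, se.HAIR_SPACE, and se.NO_BREAK_SPACE (NBSP is renamed to NO_BREAK_SPACE in the process). The se package's side of this change is not expanded in this view; a minimal sketch of the shared constants, assuming they sit at the top level of the package with the same values deleted above:

# Sketch (assumption): shared whitespace constants in the se package,
# matching the values removed from typogrify above.
WORD_JOINER = "\u2060"  # word joiner, U+2060
HAIR_SPACE = "\u200a"  # hair space, U+200A
NO_BREAK_SPACE = "\u00a0"  # no-break space, U+00A0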

unicode-names

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@ import argparse
 import sys
 import unicodedata
 
+
 def main():
     parser = argparse.ArgumentParser(description="Display Unicode code points, descriptions, and links to more details for each character in a string. Useful for differentiating between different flavors of spaces, dashes, and invisible characters like word joiners.")
     parser.add_argument("strings", metavar="STRING", nargs="*", help="a Unicode string")
@@ -22,5 +23,6 @@ def main():
     for character in line:
         print(character + "\tU+{:04X}".format(ord(character)) + "\t" + unicodedata.name(character) + "\t" + "http://unicode.org/cldr/utility/character.jsp?a={:04X}".format(ord(character)))
 
+
 if __name__ == "__main__":
     main()
