@@ -6,11 +6,8 @@ import fnmatch
6
6
import html
7
7
import regex
8
8
import smartypants
9
+ import se
9
10
10
- # Some convenience aliases
11
- WORD_JOINER = "\u2060 " # word joiner, U+2060
12
- HAIR_SPACE = "\u200a " # hair space, U+200A
13
- NBSP = "\u00a0 " # no-break space, U+00A0
14
11
15
12
def main ():
16
13
parser = argparse .ArgumentParser (description = "Apply some scriptable typography rules from the Standard Ebooks typography manual to XHTML files." )
@@ -63,47 +60,47 @@ def main():
63
60
processed_xhtml = regex .sub (r"—”([a-z])" , r"—“\1" , processed_xhtml , flags = regex .IGNORECASE )
64
61
processed_xhtml = regex .sub (r"—’([a-z])" , r"—‘\1" , processed_xhtml , flags = regex .IGNORECASE )
65
62
processed_xhtml = regex .sub (r"-“</p>" , r"—”</p>" , processed_xhtml , flags = regex .IGNORECASE )
66
- processed_xhtml = regex .sub (r"‘”</p>" , r"’{}”</p>" .format (HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
63
+ processed_xhtml = regex .sub (r"‘”</p>" , r"’{}”</p>" .format (se . HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
67
64
68
65
# Remove spaces around en and em dashes
69
66
# Note that we match at least one character before the dashes, so that we don't catch start-of-line em dashes like in poetry.
70
67
processed_xhtml = regex .sub (r"([^\.\s])\s*([–—])\s*" , r"\1\2" , processed_xhtml )
71
68
72
69
# First, remove stray word joiners
73
- processed_xhtml = processed_xhtml .replace (WORD_JOINER , "" )
70
+ processed_xhtml = processed_xhtml .replace (se . WORD_JOINER , "" )
74
71
75
72
# Some older texts use the ,— construct; remove that archaism
76
73
processed_xhtml = processed_xhtml .replace (",—" , "—" )
77
74
78
75
# Em dashes and two-em-dashes can be broken before, so add a word joiner between letters/punctuation and the following em dash
79
- processed_xhtml = regex .sub (r"([^\s{}{}{}])([—⸻])" .format (WORD_JOINER , NBSP , HAIR_SPACE ), r"\1{}\2" .format (WORD_JOINER ), processed_xhtml , flags = regex .IGNORECASE )
76
+ processed_xhtml = regex .sub (r"([^\s{}{}{}])([—⸻])" .format (se . WORD_JOINER , se . NO_BREAK_SPACE , se . HAIR_SPACE ), r"\1{}\2" .format (se . WORD_JOINER ), processed_xhtml , flags = regex .IGNORECASE )
80
77
81
78
# Add en dashes; don't replace a match that is within an HTML tag, since ids and attrs often contain the pattern DIGIT-DIGIT
82
79
processed_xhtml = regex .sub (r"(?<!<[^>]*)([0-9]+)\-([0-9]+)" , r"\1–\2" , processed_xhtml )
83
80
84
81
# Add a word joiner on both sides of en dashes
85
- processed_xhtml = regex .sub (r"{}?–{}?" .format (WORD_JOINER , WORD_JOINER ), r"{}–{}" .format (WORD_JOINER , WORD_JOINER ), processed_xhtml )
82
+ processed_xhtml = regex .sub (r"{}?–{}?" .format (se . WORD_JOINER , se . WORD_JOINER ), r"{}–{}" .format (se . WORD_JOINER , se . WORD_JOINER ), processed_xhtml )
86
83
87
84
# Add a word joiner if eliding a word with a two-em-dash
88
85
# Word joiner isn't necessary if punctuation follows
89
86
# Note the \p{{P}}. We must double-curl {} because that's the escape sequence when using .format(). The actual regex should be \p{P} to match punctuation
90
- processed_xhtml = regex .sub (r"([^\s{}{}{}])⸺" .format (WORD_JOINER , NBSP , HAIR_SPACE ), r"\1{}⸺" .format (WORD_JOINER ), processed_xhtml )
91
- processed_xhtml = regex .sub (r"⸺([^\s\p{{P}}{}])" .format (WORD_JOINER ), r"⸺{}\1" .format (WORD_JOINER ), processed_xhtml )
87
+ processed_xhtml = regex .sub (r"([^\s{}{}{}])⸺" .format (se . WORD_JOINER , se . NO_BREAK_SPACE , se . HAIR_SPACE ), r"\1{}⸺" .format (se . WORD_JOINER ), processed_xhtml )
88
+ processed_xhtml = regex .sub (r"⸺([^\s\p{{P}}{}])" .format (se . WORD_JOINER ), r"⸺{}\1" .format (se . WORD_JOINER ), processed_xhtml )
92
89
93
90
# Remove word joiners that directly follow opening tags--they're almost never correct there
94
- processed_xhtml = regex .sub (r"<([a-z]+)([^>]*?)>{}" .format (WORD_JOINER ), r"<\1\2>" , processed_xhtml , flags = regex .IGNORECASE )
91
+ processed_xhtml = regex .sub (r"<([a-z]+)([^>]*?)>{}" .format (se . WORD_JOINER ), r"<\1\2>" , processed_xhtml , flags = regex .IGNORECASE )
95
92
96
93
# Finally fix some other mistakes
97
94
processed_xhtml = processed_xhtml .replace ("—-" , "—" )
98
95
99
96
# Replace Mr., Mrs., and other abbreviations, and include a non-breaking space
100
- processed_xhtml = regex .sub (r"\b(Mr|Mr?s|Drs?|Profs?|Lieut|Fr|Lt|Capt|Pvt|Esq|Mt|St|MM|Mmes?|Mlles?)\.?\s+" , r"\1.{}" .format (NBSP ), processed_xhtml )
101
- processed_xhtml = regex .sub (r"<abbr>(Mr|Mr?s|Drs?|Profs?|Lieut|Fr|Lt|Capt|Pvt|Esq|Mt|St|MM|Mmes?|Mlles?)\.</abbr>?\s+" , r"<abbr>\1.</abbr>{}" .format (NBSP ), processed_xhtml )
97
+ processed_xhtml = regex .sub (r"\b(Mr|Mr?s|Drs?|Profs?|Lieut|Fr|Lt|Capt|Pvt|Esq|Mt|St|MM|Mmes?|Mlles?)\.?\s+" , r"\1.{}" .format (se . NO_BREAK_SPACE ), processed_xhtml )
98
+ processed_xhtml = regex .sub (r"<abbr>(Mr|Mr?s|Drs?|Profs?|Lieut|Fr|Lt|Capt|Pvt|Esq|Mt|St|MM|Mmes?|Mlles?)\.</abbr>?\s+" , r"<abbr>\1.</abbr>{}" .format (se . NO_BREAK_SPACE ), processed_xhtml )
102
99
103
- processed_xhtml = regex .sub (r"\bNo\.\s+([0-9]+)" , r"No.{}\1" .format (NBSP ), processed_xhtml )
104
- processed_xhtml = regex .sub (r"<abbr>No\.</abbr>\s+" , r"<abbr>No.</abbr>{}" .format (NBSP ), processed_xhtml )
100
+ processed_xhtml = regex .sub (r"\bNo\.\s+([0-9]+)" , r"No.{}\1" .format (se . NO_BREAK_SPACE ), processed_xhtml )
101
+ processed_xhtml = regex .sub (r"<abbr>No\.</abbr>\s+" , r"<abbr>No.</abbr>{}" .format (se . NO_BREAK_SPACE ), processed_xhtml )
105
102
106
- processed_xhtml = regex .sub (r"([0-9]+)\s<abbr" , r"\1{}<abbr" .format (NBSP ), processed_xhtml )
103
+ processed_xhtml = regex .sub (r"([0-9]+)\s<abbr" , r"\1{}<abbr" .format (se . NO_BREAK_SPACE ), processed_xhtml )
107
104
108
105
# A note on spacing:
109
106
# ibooks kindle (mobi7)
@@ -130,9 +127,9 @@ def main():
130
127
# nth (as in nth degree)
131
128
processed_xhtml = regex .sub (r"\bn\-?th\b" , r"<i>n</i>th" , processed_xhtml )
132
129
133
- # Remove double spaces that use nbsp for spacing
134
- processed_xhtml = regex .sub (r"{}[{} ]+" .format (NBSP , NBSP ), r" " , processed_xhtml )
135
- processed_xhtml = regex .sub (r" [{} ]+" .format (NBSP ), r" " , processed_xhtml )
130
+ # Remove double spaces that use a no-break space for spacing
131
+ processed_xhtml = regex .sub (r"{}[{} ]+" .format (se . NO_BREAK_SPACE , se . NO_BREAK_SPACE ), r" " , processed_xhtml )
132
+ processed_xhtml = regex .sub (r" [{} ]+" .format (se . NO_BREAK_SPACE ), r" " , processed_xhtml )
136
133
137
134
# House style: remove spacing from common Latinisms
138
135
processed_xhtml = regex .sub (r"([Ii])\.\s+e\." , r"\1.e." , processed_xhtml )
@@ -143,40 +140,40 @@ def main():
143
140
processed_xhtml = regex .sub (r"B\.\s+C\." , r"BC" , processed_xhtml )
144
141
145
142
# Put spacing next to close quotes
146
- processed_xhtml = regex .sub (r"“[\s{}]*‘" .format (NBSP ), r"“{}‘" .format (HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
147
- processed_xhtml = regex .sub (r"’[\s{}]*”" .format (NBSP ), r"’{}”" .format (HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
148
- processed_xhtml = regex .sub (r"“[\s{}]*’" .format (NBSP ), r"“{}’" .format (HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
149
- processed_xhtml = regex .sub (r"‘[\s{}]*“" .format (NBSP ), r"‘{}“" .format (HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
143
+ processed_xhtml = regex .sub (r"“[\s{}]*‘" .format (se . NO_BREAK_SPACE ), r"“{}‘" .format (se . HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
144
+ processed_xhtml = regex .sub (r"’[\s{}]*”" .format (se . NO_BREAK_SPACE ), r"’{}”" .format (se . HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
145
+ processed_xhtml = regex .sub (r"“[\s{}]*’" .format (se . NO_BREAK_SPACE ), r"“{}’" .format (se . HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
146
+ processed_xhtml = regex .sub (r"‘[\s{}]*“" .format (se . NO_BREAK_SPACE ), r"‘{}“" .format (se . HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
150
147
151
148
# We require a non-letter char at the end, otherwise we might match a contraction: “Hello,” ’e said.
152
- processed_xhtml = regex .sub (r"”[\s{}]*’([^a-zA-Z])" .format (NBSP ), r"”{}’\1" .format (HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
149
+ processed_xhtml = regex .sub (r"”[\s{}]*’([^a-zA-Z])" .format (se . NO_BREAK_SPACE ), r"”{}’\1" .format (se . HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
153
150
154
151
# Fix ellipses spacing
155
152
processed_xhtml = regex .sub (r"\s*\.\s*\.\s*\.\s*" , r"…" , processed_xhtml , flags = regex .IGNORECASE )
156
- processed_xhtml = regex .sub (r"[\s{}]?…[\s{}]?\." .format (NBSP , NBSP ), r".{}…" .format (HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
157
- processed_xhtml = regex .sub (r"[\s{}]?…[\s{}]?" .format (NBSP , NBSP ), r"{}… " .format (HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
158
- processed_xhtml = regex .sub (r"<p([^>]*?)>{}…" .format (HAIR_SPACE ), r"<p\1>…" , processed_xhtml , flags = regex .IGNORECASE )
153
+ processed_xhtml = regex .sub (r"[\s{}]?…[\s{}]?\." .format (se . NO_BREAK_SPACE , se . NO_BREAK_SPACE ), r".{}…" .format (se . HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
154
+ processed_xhtml = regex .sub (r"[\s{}]?…[\s{}]?" .format (se . NO_BREAK_SPACE , se . NO_BREAK_SPACE ), r"{}… " .format (se . HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
155
+ processed_xhtml = regex .sub (r"<p([^>]*?)>{}…" .format (se . HAIR_SPACE ), r"<p\1>…" , processed_xhtml , flags = regex .IGNORECASE )
159
156
160
157
# Remove spaces between opening tags and ellipses
161
- processed_xhtml = regex .sub (r"(<[a-z0-9]+[^<]+?>)[\s{}]?…" .format (NBSP ), r"\1…" , processed_xhtml , flags = regex .IGNORECASE )
158
+ processed_xhtml = regex .sub (r"(<[a-z0-9]+[^<]+?>)[\s{}]?…" .format (se . NO_BREAK_SPACE ), r"\1…" , processed_xhtml , flags = regex .IGNORECASE )
162
159
163
160
# Remove spaces between closing tags and ellipses
164
- processed_xhtml = regex .sub (r"…[\s{}]?(</[a-z0-9]+>)" .format (NBSP ), r"…\1" , processed_xhtml , flags = regex .IGNORECASE )
165
- processed_xhtml = regex .sub (r"…[\s{}]+([\)”’])" .format (NBSP ), r"…\1" , processed_xhtml , flags = regex .IGNORECASE )
166
- processed_xhtml = regex .sub (r"([\(“‘])[\s{}]+…" .format (NBSP ), r"\1…" , processed_xhtml , flags = regex .IGNORECASE )
167
- processed_xhtml = regex .sub (r"…[\s{}]?([\!\?\.\;\,])" .format (NBSP ), r"…{}\1" .format (HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
168
- processed_xhtml = regex .sub (r"([\!\?\.\;”’])[\s{}]?…" .format (NBSP ), r"\1{}…" .format (HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
169
- processed_xhtml = regex .sub (r"\,[\s{}]?…" .format (NBSP ), r",{}…" .format (HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
161
+ processed_xhtml = regex .sub (r"…[\s{}]?(</[a-z0-9]+>)" .format (se . NO_BREAK_SPACE ), r"…\1" , processed_xhtml , flags = regex .IGNORECASE )
162
+ processed_xhtml = regex .sub (r"…[\s{}]+([\)”’])" .format (se . NO_BREAK_SPACE ), r"…\1" , processed_xhtml , flags = regex .IGNORECASE )
163
+ processed_xhtml = regex .sub (r"([\(“‘])[\s{}]+…" .format (se . NO_BREAK_SPACE ), r"\1…" , processed_xhtml , flags = regex .IGNORECASE )
164
+ processed_xhtml = regex .sub (r"…[\s{}]?([\!\?\.\;\,])" .format (se . NO_BREAK_SPACE ), r"…{}\1" .format (se . HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
165
+ processed_xhtml = regex .sub (r"([\!\?\.\;”’])[\s{}]?…" .format (se . NO_BREAK_SPACE ), r"\1{}…" .format (se . HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
166
+ processed_xhtml = regex .sub (r"\,[\s{}]?…" .format (se . NO_BREAK_SPACE ), r",{}…" .format (se . HAIR_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
170
167
171
168
# Remove spaces between ellipses and endnotes directly after
172
- processed_xhtml = regex .sub (r"…[\s{}]?(<a[^>]+?id=\"note-[0-9]+\"[^>]*?>)" .format (NBSP ), r"…\1" , processed_xhtml , flags = regex .IGNORECASE )
169
+ processed_xhtml = regex .sub (r"…[\s{}]?(<a[^>]+?id=\"note-[0-9]+\"[^>]*?>)" .format (se . NO_BREAK_SPACE ), r"…\1" , processed_xhtml , flags = regex .IGNORECASE )
173
170
174
171
# Add non-breaking spaces between amounts with an abbreviated unit. E.g. 8 oz., 10 lbs.
175
- processed_xhtml = regex .sub (r"([0-9])\s+([a-z]{1,3}\.)" , r"\1{}\2" .format (NBSP ), processed_xhtml , flags = regex .IGNORECASE )
172
+ processed_xhtml = regex .sub (r"([0-9])\s+([a-z]{1,3}\.)" , r"\1{}\2" .format (se . NO_BREAK_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
176
173
177
174
# Add non-breaking spaces between Arabic numbers and AM/PM
178
- processed_xhtml = regex .sub (r"([0-9])\s+([ap])\.m\." , r"\1{}\2.m." .format (NBSP ), processed_xhtml , flags = regex .IGNORECASE )
179
- processed_xhtml = regex .sub (r"([0-9])\s+<abbr([^>]*?)>([ap])\.m\." , r"\1{}<abbr\2>\3.m." .format (NBSP ), processed_xhtml , flags = regex .IGNORECASE )
175
+ processed_xhtml = regex .sub (r"([0-9])\s+([ap])\.m\." , r"\1{}\2.m." .format (se . NO_BREAK_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
176
+ processed_xhtml = regex .sub (r"([0-9])\s+<abbr([^>]*?)>([ap])\.m\." , r"\1{}<abbr\2>\3.m." .format (se . NO_BREAK_SPACE ), processed_xhtml , flags = regex .IGNORECASE )
180
177
181
178
processed_xhtml = processed_xhtml .replace ("Ph.D" , "PhD" )
182
179
processed_xhtml = regex .sub (r"P\.\s*S\." , r"P.S." , processed_xhtml )
@@ -212,5 +209,6 @@ def main():
212
209
if args .verbose :
213
210
print (" OK" )
214
211
212
+
215
213
if __name__ == "__main__" :
216
214
main ()
0 commit comments