19
19
20
20
import fnmatch
21
21
import io
22
- from lxml import etree
23
22
import re
24
23
import os
25
- import sys
26
24
import shutil
27
25
import urllib .parse
28
- from xml_utils import xml_escape , xml_unescape
26
+ from lxml import etree
27
+
29
28
30
29
def rmtree_if_exists(dir):
    # Recursively delete the directory tree at `dir`.
    # Does nothing when `dir` is not an existing directory (this includes
    # the case where the path names a regular file).
    if not os.path.isdir(dir):
        return
    shutil.rmtree(dir)
33
32
33
+
34
34
def move_dir_contents_to_dir(srcdir, dstdir):
    # Move every entry found directly inside `srcdir` into `dstdir`,
    # keeping each entry's name. `srcdir` itself is left in place.
    for entry in os.listdir(srcdir):
        source = os.path.join(srcdir, entry)
        destination = os.path.join(dstdir, entry)
        shutil.move(source, destination)
38
38
39
+
39
40
def rearrange_archive (root ):
40
41
# rearrange the archive. {root} here is output/reference
41
42
@@ -71,8 +72,10 @@ def rearrange_archive(root):
71
72
move_dir_contents_to_dir (src_data_path , data_path )
72
73
73
74
# also copy the custom fonts
74
- shutil .copy (os .path .join (path , 'DejaVuSansMonoCondensed60.ttf' ), data_path )
75
- shutil .copy (os .path .join (path , 'DejaVuSansMonoCondensed75.ttf' ), data_path )
75
+ shutil .copy (os .path .join (path , 'DejaVuSansMonoCondensed60.ttf' ),
76
+ data_path )
77
+ shutil .copy (os .path .join (path , 'DejaVuSansMonoCondensed75.ttf' ),
78
+ data_path )
76
79
77
80
# remove what's left
78
81
shutil .rmtree (path )
@@ -81,20 +84,23 @@ def rearrange_archive(root):
81
84
for fn in fnmatch .filter (os .listdir (root ), 'cppreference-export*.xml' ):
82
85
os .remove (os .path .join (root , fn ))
83
86
84
- # Converts complex URL to resources supplied by MediaWiki loader to a simplified name
87
+
85
88
def convert_loader_name(fn):
    # Maps the complex URL of a resource served by the MediaWiki loader to
    # a simplified file name. Raises Exception when `fn` matches none of
    # the known loader resources.

    # Plain substring rules are tried first, in priority order.
    literal_rules = (
        ("modules=site&only=scripts", "site_scripts.js"),
        ("modules=site&only=styles", "site_modules.css"),
        ("modules=startup&only=scripts", "startup_scripts.js"),
    )
    for needle, simple_name in literal_rules:
        if needle in fn:
            return simple_name

    # Regular-expression rules are consulted only after every literal rule
    # has failed.
    pattern_rules = (
        ("modules=skins.*&only=scripts", "skin_scripts.js"),
        ("modules=.*ext.*&only=styles", "ext.css"),
    )
    for pattern, simple_name in pattern_rules:
        if re.search(pattern, fn):
            return simple_name

    raise Exception(
        'Loader file {0} does not match any known files'.format(fn))
103
+
98
104
99
105
def build_rename_map (root ):
100
106
# Returns a rename map: a map from old to new file name
@@ -122,13 +128,17 @@ def build_rename_map(root):
122
128
if num > 0 :
123
129
name , ext = os .path .splitext (fn )
124
130
# add file with its path -> only rename that occurrence
125
- result [os .path .join (dir , fn )] = "{}.{}{}" .format (name , num + 1 , ext )
131
+ result [os .path .join (dir , fn )] = "{}.{}{}" .format (name , num + 1 ,
132
+ ext )
126
133
seen [low ] += 1
127
134
128
135
return result
129
136
137
+
130
138
def rename_files (root , rename_map ):
131
- for dir , old_fn in ((dir , fn ) for dir , _ , filenames in os .walk (root ) for fn in filenames ):
139
+ for dir , old_fn in ((dir , fn )
140
+ for dir , _ , filenames in os .walk (root )
141
+ for fn in filenames ):
132
142
src_path = os .path .join (dir , old_fn )
133
143
134
144
new_fn = rename_map .get (old_fn )
@@ -144,6 +154,7 @@ def rename_files(root, rename_map):
144
154
print ("Renaming {0}\n to {1}" .format (src_path , dst_path ))
145
155
shutil .move (src_path , dst_path )
146
156
157
+
147
158
def find_html_files (root ):
148
159
# find files that need to be preprocessed
149
160
html_files = []
@@ -152,21 +163,25 @@ def find_html_files(root):
152
163
html_files .append (os .path .join (dir , filename ))
153
164
return html_files
154
165
166
+
155
167
def is_loader_link(target):
    # True when `target` is an absolute http(s) URL that points at the
    # MediaWiki load.php endpoint of a cppreference.com subdomain.
    loader_url = r'https?://[a-z]+\.cppreference\.com/mwiki/load\.php'
    return re.match(loader_url, target) is not None
159
171
172
+
160
173
def transform_loader_link(target, file, root):
    # Turns an absolute load.php link into a relative one.
    # The resource is addressed as <root>/common/<simplified name> and the
    # result is expressed relative to the directory containing `file`.
    simple_name = convert_loader_name(target)
    local_copy = os.path.join(root, "common", simple_name)
    referrer_dir = os.path.dirname(file)
    return os.path.relpath(local_copy, referrer_dir)
164
177
178
+
165
179
def is_ranges_placeholder(target):
    # True when `target` is an absolute cppreference.com URL under
    # /w/cpp/ranges-placeholder/ or /w/cpp/ranges-<word>-placeholder/
    # followed by at least one more character.
    placeholder_url = r'https?://[a-z]+\.cppreference\.com/w/cpp/ranges(-[a-z]+)?-placeholder/.+'  # noqa
    return re.match(placeholder_url, target) is not None
169
183
184
+
170
185
def transform_ranges_placeholder (target , file , root ):
171
186
# Placeholder link replacement is implemented in the MediaWiki site JS at
172
187
# https://en.cppreference.com/w/MediaWiki:Common.js
@@ -175,9 +190,9 @@ def transform_ranges_placeholder(target, file, root):
175
190
repl = (r'\1/cpp/experimental/ranges/\2' if ranges else r'\1/cpp/\2' )
176
191
177
192
if 'ranges-placeholder' in target :
178
- match = r'https?://([a-z]+)\.cppreference\.com/w/cpp/ranges-placeholder/(.+)'
193
+ match = r'https?://([a-z]+)\.cppreference\.com/w/cpp/ranges-placeholder/(.+)' # noqa
179
194
else :
180
- match = r'https?://([a-z]+)\.cppreference\.com/w/cpp/ranges-([a-z]+)-placeholder/(.+)'
195
+ match = r'https?://([a-z]+)\.cppreference\.com/w/cpp/ranges-([a-z]+)-placeholder/(.+)' # noqa
181
196
repl += (r'/\3' if ranges else r'/ranges/\3' )
182
197
183
198
# Turn absolute placeholder link into site-relative link
@@ -187,24 +202,27 @@ def transform_ranges_placeholder(target, file, root):
187
202
abstarget = os .path .join (root , reltarget )
188
203
return os .path .relpath (abstarget , os .path .dirname (file ))
189
204
205
+
190
206
def is_external_link(target):
    # A link counts as external as soon as it carries a URL scheme or a
    # network location; only links with neither are treated as internal.
    parsed = urllib.parse.urlparse(target)
    is_internal = parsed.scheme == '' and parsed.netloc == ''
    return not is_internal
193
209
210
+
194
211
def trasform_relative_link (rename_map , target , file ):
195
212
# urlparse returns (scheme, host, path, params, query, fragment)
196
213
_ , _ , path , params , _ , fragment = urllib .parse .urlparse (target )
197
214
assert params == ''
198
215
199
216
path = urllib .parse .unquote (path )
200
- path = path .replace ('../../upload.cppreference.com/mwiki/' ,'../common/' )
201
- path = path .replace ('../mwiki/' ,'../common/' )
217
+ path = path .replace ('../../upload.cppreference.com/mwiki/' , '../common/' )
218
+ path = path .replace ('../mwiki/' , '../common/' )
202
219
203
220
dir , fn = os .path .split (path )
204
221
new_fn = rename_map .get (fn )
205
222
if new_fn :
206
223
# look for case conflict of the renamed file
207
- abstarget = os .path .normpath (os .path .join (os .path .dirname (file ), dir , new_fn ))
224
+ abstarget = os .path .normpath (os .path .join (os .path .dirname (file ),
225
+ dir , new_fn ))
208
226
new_fn = rename_map .get (abstarget , new_fn )
209
227
else :
210
228
# original filename unchanged, look for case conflict
@@ -216,11 +234,13 @@ def trasform_relative_link(rename_map, target, file):
216
234
path = urllib .parse .quote (path )
217
235
return urllib .parse .urlunparse (('' , '' , path , params , '' , fragment ))
218
236
237
+
219
238
# Transforms a link in the given file according to rename map.
220
239
# target is the link to transform.
221
240
# file is the path of the file the link came from.
222
241
# root is the path to the root of the archive.
223
242
def transform_link (rename_map , target , file , root ):
243
+
224
244
if is_loader_link (target ):
225
245
return transform_loader_link (target , file , root )
226
246
@@ -232,6 +252,7 @@ def transform_link(rename_map, target, file, root):
232
252
233
253
return trasform_relative_link (rename_map , target , file )
234
254
255
+
235
256
def has_class (el , * classes_to_check ):
236
257
value = el .get ('class' )
237
258
if value is None :
@@ -242,6 +263,7 @@ def has_class(el, *classes_to_check):
242
263
return True
243
264
return False
244
265
266
+
245
267
# remove non-printable elements
246
268
def remove_noprint (html ):
247
269
for el in html .xpath ('//*' ):
@@ -250,14 +272,16 @@ def remove_noprint(html):
250
272
elif el .get ('id' ) in ['toc' , 'catlinks' ]:
251
273
el .getparent ().remove (el )
252
274
275
+
253
276
# remove see also links between C and C++ documentations
254
277
def remove_see_also (html ):
255
278
for el in html .xpath ('//tr[@class]' ):
256
279
if not has_class (el , 't-dcl-list-item' , 't-dsc' ):
257
280
continue
258
281
259
282
child_tds = el .xpath ('.//td/div[@class]' )
260
- if not any (has_class (td , 't-dcl-list-see' , 't-dsc-see' ) for td in child_tds ):
283
+ if not any (has_class (td , 't-dcl-list-see' , 't-dsc-see' )
284
+ for td in child_tds ):
261
285
continue
262
286
263
287
# remove preceding separator, if any
@@ -276,17 +300,23 @@ def remove_see_also(html):
276
300
next = el .getnext ()
277
301
if next is None :
278
302
el .getparent ().remove (el )
279
- elif next .tag == 'table' and has_class (next , 't-dcl-list-begin' ) and len (next .xpath ('.//tr' )) == 0 :
303
+ elif next .tag == 'table' and has_class (next , 't-dcl-list-begin' ) and \
304
+ len (next .xpath ('.//tr' )) == 0 :
280
305
el .getparent ().remove (el )
281
306
next .getparent ().remove (next )
282
307
308
+
283
309
# remove Google Analytics scripts
284
310
def remove_google_analytics(html):
    # Drops every <script> directly under /html/body that either loads
    # google-analytics.com/ga.js through its src attribute, or whose inline
    # text mentions that URL or 'pageTracker'.
    # `html` is the parsed document; it must provide xpath().
    ga_url = 'google-analytics.com/ga.js'
    for el in html.xpath('/html/body/script'):
        src = el.get('src')
        text = el.text
        loads_ga = src is not None and ga_url in src
        # The inline check is evaluated independently of the src check.
        # Nesting it in an elif under "src is not None" would skip it for
        # scripts that carry an unrelated src attribute.
        inline_ga = text is not None and (ga_url in text or
                                          'pageTracker' in text)
        if loads_ga or inline_ga:
            el.getparent().remove(el)
319
+
290
320
291
321
# remove Carbon ads
292
322
def remove_ads (html ):
@@ -297,13 +327,15 @@ def remove_ads(html):
297
327
if el .text is not None and '#carbonads' in el .text :
298
328
el .getparent ().remove (el )
299
329
330
+
300
331
# remove links to file info pages (e.g. on images)
301
332
def remove_fileinfo(html):
    # Removes, for every <a> whose href matches a cppreference.com
    # /w/File: URL, the parent element of that link. The regular-expression
    # test relies on the EXSLT "re" XPath extension namespace.
    exslt_ns = {'re':'http://exslt.org/regular-expressions'}  # noqa
    expr = r"//a[re:test(@href, 'https?://[a-z]+\.cppreference\.com/w/File:')]/.."  # noqa
    find_link_parents = etree.XPath(expr, namespaces=exslt_ns)
    for container in find_link_parents(html):
        container.getparent().remove(container)
306
337
338
+
307
339
# remove external links to unused resources
308
340
def remove_unused_external (html ):
309
341
for el in html .xpath ('/html/head/link' ):
@@ -313,6 +345,7 @@ def remove_unused_external(html):
313
345
(head , tail ) = os .path .split (el .get ('href' ))
314
346
el .set ('href' , os .path .join (head , 'common' , tail ))
315
347
348
+
316
349
def preprocess_html_file (root , fn , rename_map ):
317
350
parser = etree .HTMLParser ()
318
351
html = etree .parse (fn , parser )
@@ -331,23 +364,27 @@ def preprocess_html_file(root, fn, rename_map):
331
364
for el in html .xpath ('//*[@href]' ):
332
365
el .set ('href' , transform_link (rename_map , el .get ('href' ), fn , root ))
333
366
334
- for err in parser .error_log :
367
+ for err in list ( parser .error_log ) :
335
368
print ("HTML WARN: {0}" .format (err ), file = output )
336
369
337
370
html .write (fn , encoding = 'utf-8' , method = 'html' )
338
371
return output .getvalue ()
339
372
373
+
340
374
def preprocess_css_file (fn ):
341
375
f = open (fn , "r" , encoding = 'utf-8' )
342
376
text = f .read ()
343
377
f .close ()
344
378
345
379
# note that query string is not used in css files
346
380
347
- text = text .replace ('../DejaVuSansMonoCondensed60.ttf' , 'DejaVuSansMonoCondensed60.ttf' )
348
- text = text .replace ('../DejaVuSansMonoCondensed75.ttf' , 'DejaVuSansMonoCondensed75.ttf' )
381
+ text = text .replace ('../DejaVuSansMonoCondensed60.ttf' ,
382
+ 'DejaVuSansMonoCondensed60.ttf' )
383
+ text = text .replace ('../DejaVuSansMonoCondensed75.ttf' ,
384
+ 'DejaVuSansMonoCondensed75.ttf' )
349
385
350
- text = text .replace ('../../upload.cppreference.com/mwiki/images/' , 'images/' )
386
+ text = text .replace ('../../upload.cppreference.com/mwiki/images/' ,
387
+ 'images/' )
351
388
352
389
# QT Help viewer doesn't understand nth-child
353
390
text = text .replace ('nth-child(1)' , 'first-child' )
@@ -356,6 +393,7 @@ def preprocess_css_file(fn):
356
393
f .write (text )
357
394
f .close ()
358
395
396
+
359
397
def preprocess_startup_script (fn ):
360
398
with open (fn , "r" , encoding = 'utf-8' ) as f :
361
399
text = f .read ()
0 commit comments