######################################################################
# regular expressions used by Zwiki
#
# Some people, when confronted with a problem, think 'I know, I'll use
# regular expressions.' Now they have two problems. --Jamie Zawinski
#
# Be brave. Read on.
#
# don't bother trying to keep to 80 char lines, here
import re, string
import Defaults
from Utils import BLATHER, DEBUG, formattedTraceback
# URLs/URIs (better regexps in urllib/urlparse ?)
urlchars = r'[A-Za-z0-9/:;@_%~#=&*\.\-\?\+\$,]+'
url = r'["=]?((about|gopher|http|https|ftp|mailto|file):%s)' % (urlchars)
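# for example (an illustrative sketch, not from the original code):
#   re.search(url, 'see http://zwiki.org/FrontPage for more').group(1)
#   -> 'http://zwiki.org/FrontPage'
# schemes outside the alternation above (eg news:) are not matched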
# valid characters for zwiki page ids
# These are the characters which are used to form the ids of zwiki page
# objects. They must be legal in both zope ids and urls, and exclude _
# which is used for quoting. Cf http://zwiki.org/HowZwikiTitleAndIdWorks .
# It is possible to enable single-byte non-ascii letters in page ids here
# if you also hack bad_id in zope's OFS/ObjectManager.py, but then your
# urls would be illegal.
zwikiidcharsexpr = re.compile(r'[a-zA-Z0-9.-]')
# used in generating page ids
# XXX NB this is affected by locale - may not be what we want
spaceandlowerexpr = re.compile(r'\s+([%s])'%(string.lowercase))
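# one plausible use (a hypothetical example, not necessarily how zwiki
# calls it): uppercasing the letter after each run of whitespace -
#   spaceandlowerexpr.sub(lambda m: m.group(1).upper(), 'my page name')
#   -> 'myPageName'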
# free-form wiki links
# zwiki uses three kinds of delimiters to enclose free-form wiki page names:
# single bracketed phrases (only) [...]
# what's inside the brackets should be group 1
singlebracketedexpr = r'\[(?:(?!\[))([^\n\]]+)\]'
# wikipedia-style double brackets [[...]]
doublebracketedexpr = r'\[\[([^\n\]]+)\]\]'
# wicked-style double parentheses ((...)), for international users whose
# keyboards make brackets hard to type
doubleparenthesisexpr = r'\(\(([^\n)]+)\)\)'
# match either single or double brackets, to simplify later regexps a little
bracketedexpr = r'\[\[?([^\n\]]+)\]\]?'
# match any kind of bracket or double parenthesis styles, group for sub()
bracketmatch = re.compile(r'(\[\[?|\(\()([^\n\]]+)(\]\]?|\)\))')
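# for example (illustrative only):
#   re.match(singlebracketedexpr,  '[Some Page]').group(1)     -> 'Some Page'
#   re.match(doublebracketedexpr,  '[[Some Page]]').group(1)   -> 'Some Page'
#   re.match(doubleparenthesisexpr,'((Some Page))').group(1)   -> 'Some Page'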
# bare wikinames
#
# "bare wikinames" here means page names which will be automatically
# wiki-linked without needing to be enclosed in brackets. Zwiki's
# wikinames are standard c2.com-style CamelCase plus the following
# additions:
#
# - words of a single letter are allowed (APage, PageA)
#
# - trailing digits are allowed (PageVersion22, but not Page22Version)
# (XXX for simplicity and to reduce unexpected wikilinking, eg of big
# random texts. Change this ?)
#
# - non-ascii letters are allowed. We aim to as far as possible work as
# international users would expect, out of the box and regardless of
# python version, locale setting, platform etc. Better ideas are
# welcome. One of two possible setups is chosen at startup:
#
# 1. if a system locale is configured, the locale's letters are allowed
#
# We include these in our regexps below. We need them utf8-encoded
# since zwiki text is always stored utf8-encoded. So we convert them
# from the system's default encoding to unicode and re-encode as
# utf8. It's hard to see how to do this robustly on all systems and
# it has been the cause of many zwiki startup problems, so we must be
# careful not to let any error stop product initialisation (#769,
# #1158). Other notes: don't rely on python 2.3's
# getpreferredencoding, gives wrong answer; work around a python bug
# with some locales (#392).
#
# 2. if no system locale is configured or there was an error during the
# above, a default set of non-ascii letters are allowed
#
# On systems where we can't detect the locale's characters, we jump
# through some hoops to support a number of non-ascii letters common
# in latin and other languages so things are more likely to "just
# work" for more users.
# we'll set up the following strings to use when building the regexps:
# U: 'A|B|C|... '
# L: 'a|b|c|...'
# Ubr: '[ABC...]'
# Lbr: '[abc...]'
# where A, b etc. are the utf8-encoded upper & lower-case letters.
try:
    import locale
    lang, encoding = locale.getlocale()
    U = '|'.join([c.encode('utf8') for c in unicode(string.uppercase, encoding)])
    L = '|'.join([c.encode('utf8') for c in unicode(string.lowercase, encoding)])
    Ubr = '[%s]' % ''.join([c.encode('utf8') for c in unicode(string.uppercase, encoding)])
    Lbr = '[%s]' % ''.join([c.encode('utf8') for c in unicode(string.lowercase, encoding)])
    relocaleflag = r'(?L)'
    wordboundary = r'\b'
except (TypeError,LookupError):
    # no locale is set, or there was a problem detecting it or a
    # problem decoding its letters.
    # XXX must be a less ugly way to do this:
    # if it's just that there's no locale, don't log a warning
    try: lang, encoding = locale.getlocale()
    except locale.Error: lang, encoding = -1,-1
    if (lang, encoding) == (None, None): pass
    else:
        BLATHER('the system locale gave a problem in Regexps.py, so WikiNames will not be locale-aware')
        DEBUG(formattedTraceback())
    # define a useful default set of non-ascii letters, mainly european letters
    # from http://zwiki.org/InternationalCharacterExamples
    # XXX more have been added to that page (latvian, polish).. how far
    # should we go with this ? Could we make it always recognise all
    # letters and forget locale awareness ? Are regexps getting slow ?
    # XXX needs more work, see failing links at
    # http://zwiki.org/InternationalCharactersInPageNames
    uppercase = string.uppercase + '\xc3\x80\xc3\x81\xc3\x82\xc3\x83\xc3\x84\xc3\x85\xc3\x86\xc3\x88\xc3\x89\xc3\x8a\xc3\x8b\xc3\x8c\xc3\x8d\xc3\x8e\xc3\x8f\xc3\x92\xc3\x93\xc3\x94\xc3\x95\xc3\x96\xc3\x98\xc3\x99\xc3\x9a\xc3\x9b\xc3\x9c\xc3\x9d\xc3\x87\xc3\x90\xc3\x91\xc3\x9e'
    lowercase = string.lowercase + '\xc3\xa0\xc3\xa1\xc3\xa2\xc3\xa3\xc3\xa4\xc3\xa5\xc3\xa6\xc3\xa8\xc3\xa9\xc3\xaa\xc3\xab\xc3\xac\xc3\xad\xc3\xae\xc3\xaf\xc3\xb2\xc3\xb3\xc3\xb4\xc3\xb5\xc3\xb6\xc3\xb8\xc3\xb9\xc3\xba\xc3\xbb\xc3\xbc\xc3\xbd\xc3\xbf\xc2\xb5\xc3\x9f\xc3\xa7\xc3\xb0\xc3\xb1\xc3\xbe'
    U = '|'.join([c.encode('utf8') for c in unicode(uppercase,'utf-8')])
    L = '|'.join([c.encode('utf8') for c in unicode(lowercase,'utf-8')])
    Ubr = '[%s]' % ''.join([c.encode('utf8') for c in unicode(uppercase,'utf-8')])
    Lbr = '[%s]' % ''.join([c.encode('utf8') for c in unicode(lowercase,'utf-8')])
    relocaleflag = ''
    wordboundary = '(?<![A-Za-z0-9\x80-\xff])'
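# either way, the four strings now look roughly like this (a sketch for
# plain ascii; any non-ascii letters appear utf8-encoded as well):
#   U   = 'A|B|C|...|Z'      L   = 'a|b|c|...|z'
#   Ubr = '[ABC...Z...]'     Lbr = '[abc...z...]'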
# the basic bare wikiname regexps
# ?: makes a group non-capturing (no backreference is saved), a minor performance optimization
wikiname1 = r'%s%s(?:%s)+(?:%s)+(?:%s)(?:%s|%s)*[0-9]*' % (relocaleflag,wordboundary,U,L,U,U,L)
wikiname2 = r'%s%s(?:%s)(?:%s)+(?:%s)(?:%s|%s)*[0-9]*' % (relocaleflag,wordboundary,U,U,L,U,L)
# are we having fun yet ?
# don't match things like &rarr;
# could also do this in markLinksIn and make it per-pagetype ?
wikiname3 = r'(?:%s|%s)' % (wikiname1, wikiname2)
# is there a reason for the following regexp ?
# I think "no ampersand before, or an ampersand before but no letter/; after" is enough
# cautiously commented --StefanRank
# more trouble: the XML spec also allows &---WikiName---;
#wikiname4 = r'(?:(?<!&)%s|(?<=&)%s(?![%s;]))' % (wikiname3, wikiname3, U+L)
wikiname4 = r'(?:(?<!&)%s(?![%s])|(?<=&)%s(?![%s;]))' % (wikiname3, U+L, wikiname3, U+L)
wikiname = r'!?(%s)' %(wikiname4)
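# for example (illustrative, using the ascii letters):
#   'WikiName', 'APage', 'PageA' and 'PageVersion22' are matched;
#   'Page22Version', 'lowercase' and the entity '&AElig;' are not.
# the optional leading ! is matched but not captured, presumably so
# callers can let '!WikiName' suppress the link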
# issue number links XXX should be in the tracker plugin ?
simplehashnumber = r'\#[0-9]+'
# avoid html entities like &#123;
hashnumberexpr = r'(?:(?<!&)%s|(?<=&)%s(?![0-9;]))' % (simplehashnumber, simplehashnumber)
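# for example: '#123' in running text is matched, but no match is found
# inside the numeric entity '&#123;' (preceded by & and followed by ;)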
wikilink = r'!?(%s|%s|%s|%s|%s)' % (wikiname4,bracketedexpr,doubleparenthesisexpr,url,hashnumberexpr)
localwikilink1 = r'(?:%s|%s|%s|%s)' % (wikiname4,bracketedexpr,doubleparenthesisexpr,hashnumberexpr)
localwikilink = r'!?(%s)' % (localwikilink1)
interwikilink = r'!?((?P<local>%s):(?P<remote>\w%s))' % (localwikilink1,urlchars) # local wiki link, a colon, one word char, one or more url chars
anywikilinkexpr = re.compile(r'(%s|%s)' % (interwikilink,wikilink))
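# for example (illustrative): anywikilinkexpr finds 'WikiName',
# '[Some Page]', '#42' and 'http://zwiki.org/' in a text; for
# 'NeighbourWiki:SomePage' the named groups are local='NeighbourWiki'
# and remote='SomePage'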
markedwikilinkexpr = re.compile(r'<zwiki>(.*?)</zwiki>')
untitledwikilinkexpr = re.compile(r'<a href="([^"/]*/)*(?P<page>[^/"]*)" title="">.*?</a>')
wikinamewords = r'((%s(?!%s))+|%s%s+|[0-9]+)'%(Ubr,Lbr,Ubr,Lbr)
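# wikinamewords presumably splits a page name into its words; a sketch:
#   [m.group() for m in re.finditer(wikinamewords, 'WikiName22')]
#   -> ['Wiki', 'Name', '22']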
remotewikiurl = r'(?mi)RemoteWikiURL[:\s]+(?P<remoteurl>[^\s]*)\s*$'
protected_line = r'(?m)^!(.*)$'
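# a sketch of one possible use of protected_line (hypothetical),
# stripping the marker so the rest of the line is left alone:
#   re.sub(protected_line, r'\1', '!NotALink')  ->  'NotALink'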
# stx footnotes
# handled by us so as to co-exist with our bracketed links
# real stx allows refchars = r'[0-9_%s-]' % (string.letters)
footnoteexpr = r'(?sm)^\.\. \[([^\n\]]+)\]'
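# for example:
#   re.match(footnoteexpr, '.. [1] footnote text').group(1)  ->  '1'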
# for stripping javascript
# XXX needs work, eg should not match
# <input... name="ZPythonScriptHTML_editAction:method">
javascriptexpr = r'(?iL)<(([^>\w]*script|iframe)[^>]*)>' # \1 will be displayed
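# a sketch of the intended effect (assuming sub-based stripping):
#   re.sub(javascriptexpr, r'&lt;\1&gt;', '<script src="x.js">')
#   -> '&lt;script src="x.js"&gt;'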
# for stripping HTML header/footer
# XXX these are expensive, may hit max recursion limit on bsd
htmlheaderexpr = r'(?si)^(\s*<!doctype.*?)?\s*<html.*?<body.*?>'
htmlfooterexpr = r'(?si)</body.*?>\s*</html.*?>\s*$'
# better ? safe ?
htmlbodyexpr = r'(?si)^.*?<body[^>]*?>(.*)</body[^>]*?>.*?$'
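# for example (illustrative):
#   re.sub(htmlbodyexpr, r'\1', '<html><body class="x">Hi</body></html>')
#   -> 'Hi'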
# one more badass regexp:
# sgml tags, including ones containing dtml & python expressions and multiline
# Notes:
# - r'(?s)<((".*?")|[^">]+)*>' takes exponential time
# - r'(?s)<((".*?")|[^">]+(?![^">]))*>' avoids backtracking (see perlre)
# - to avoid matching casual angle bracket use, treat dtml separately
# - recognising that stuff like <!-- dtml-var ...> & </dtml ...> is also dtml
# - and that a simple sgml tag may contain a dtml tag
# - put dtml pattern first, longest match does not apply with (|) I think
# - not perfect, but seems to match everything in practice
dtmlentity = r'(?i)&dtml.*?;'
dtmltag = r'(?si)<[-/! ]*dtml((".*?")|[^">]+(?![^">]))*>'
tagchars = r'[%s0-9\.\=\'\"\:\/\-\#\+\s\*\;\!\&\-]' % string.letters
sgmltag = r'<((".*?")|(%s)|%s+(?!%s))>' % (dtmltag,tagchars,tagchars)
dtmlorsgmlexpr = r'(%s|%s|%s)' % (dtmltag,sgmltag,dtmlentity)
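# for example (illustrative), dtmlorsgmlexpr matches things like
# '<dtml-var title>', '<!-- dtml-var title -->', '&dtml-title;' and
# plain sgml tags such as '<b>' or '<a href="x">'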
# From_ separator used to recognize rfc2822 messages - regexp from mailbox.py
# used in Comments.py
fromlineexpr = r'(?m)(?:^|\n\n)From \s*[^\s]+\s+\w\w\w\s+\w\w\w\s+\d?\d\s+\d?\d:\d\d(:\d\d)?(\s+[^\s]+)?\s+\d\d\d\d\s*$'
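# for example, a classic mbox separator line matches:
#   'From jdoe@example.com Mon Jan  1 12:34:56 2024'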
# purple numbers NIDs XXX should be in purplenumbers plugin ?
nidexpr = r'\s*({nid (?P<nid>[0-9A-Za-z]+?)})'
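# for example (illustrative):
#   re.search(nidexpr, 'some text {nid z7}').group('nid')  ->  'z7'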