Skip to content

Commit 78da21b

Browse files
refeed authored and timabbott committed
lib/url.py: Strengthen URL escaping.
Use the same implementation that Zulip itself uses to sanitize stream names and topic names into URL-safe strings. Closes #35
1 parent 6d188a8 commit 78da21b

File tree

5 files changed

+41
-30
lines changed

5 files changed

+41
-30
lines changed

lib/html.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
from .url import (
1919
sanitize_stream,
20-
sanitize_topic,
20+
sanitize,
2121
)
2222

2323
from .url import (
@@ -87,7 +87,7 @@ def format_message_html(
8787
site_url,
8888
html_root,
8989
sanitize_stream(stream_name, stream_id),
90-
sanitize_topic(topic_name),
90+
sanitize(topic_name),
9191
msg_id,
9292
)
9393
anchor_html = '<a name="{0}"></a>'.format(html.escape(msg_id))
@@ -184,7 +184,7 @@ def topic_list_html(topic_data):
184184
"""
185185

186186
def item_html(topic_name, message_data):
187-
link_html = f'<a href="topic/{html.escape(sanitize_topic(topic_name))}.html">{html.escape(topic_name)}</a>'
187+
link_html = f'<a href="topic/{html.escape(sanitize(topic_name))}.html">{html.escape(topic_name)}</a>'
188188
topic_info = topic_info_string(message_data)
189189
return f"<li> {link_html} ({html.escape(topic_info)}) </li>"
190190

lib/populate.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
)
5151
from .url import (
5252
sanitize_stream,
53-
sanitize_topic,
53+
sanitize,
5454
)
5555

5656

@@ -222,7 +222,7 @@ def populate_incremental(
222222
p = (
223223
json_root
224224
/ Path(sanitize_stream(s["name"], s["stream_id"]))
225-
/ Path(sanitize_topic(topic_name) + ".json")
225+
/ Path(sanitize(topic_name) + ".json")
226226
)
227227
topic_exists = p.exists()
228228
old = []
@@ -257,7 +257,7 @@ def dump_topic_messages(json_root, stream_data, topic_name, message_data):
257257
sanitized_stream_name = sanitize_stream(stream_name, stream_id)
258258
stream_dir = json_root / Path(sanitized_stream_name)
259259

260-
sanitized_topic_name = sanitize_topic(topic_name)
260+
sanitized_topic_name = sanitize(topic_name)
261261
topic_fn = sanitized_topic_name + ".json"
262262

263263
out = open_outfile(stream_dir, topic_fn, "w")

lib/url.py

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -66,29 +66,25 @@ def archive_message_url(
6666

6767
## String cleaning functions
6868

69-
# remove non-alnum ascii symbols from string
70-
def sanitize(s):
71-
return (
72-
"".join(
73-
filter(
74-
lambda x: x.isalnum or x == " ",
75-
s.encode("ascii", "ignore").decode("utf-8"),
76-
)
77-
)
78-
.replace(" ", "-")
79-
.replace("?", "%3F")
80-
)
8169

70+
def sanitize(s):
71+
"""
72+
Sanitize the string to a safe string that can be used in URLs
8273
83-
# create a unique sanitized identifier for a topic
84-
def sanitize_topic(topic_name):
85-
return (
86-
urllib.parse.quote(topic_name, safe="~()*!.'")
87-
.replace(".", "%2E")
88-
.replace("%", ".")
89-
)
74+
This is copied from Zulip's core code:
75+
https://github.com/zulip/zulip/blob/de31114d700561f32139a63a0e5f33d5c30039b3/zerver/lib/url_encoding.py#L8
76+
"""
77+
return urllib.parse.quote(s, safe=b"").replace(".", "%2E").replace("%", ".")
9078

9179

9280
# create a unique sanitized identifier for a stream
9381
def sanitize_stream(stream_name, stream_id):
82+
"""
83+
Encode streams for urls as something like 99-Foo-bar.
84+
85+
This is copied from Zulip's core code:
86+
https://github.com/zulip/zulip/blob/de31114d700561f32139a63a0e5f33d5c30039b3/zerver/lib/url_encoding.py#L15
87+
"""
88+
89+
stream_name = stream_name.replace(" ", "-")
9490
return str(stream_id) + "-" + sanitize(stream_name)

lib/website.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
from .url import (
2323
sanitize_stream,
24-
sanitize_topic,
24+
sanitize,
2525
)
2626

2727
from .files import (
@@ -219,7 +219,7 @@ def write_topic_messages(
219219
stream_id = stream["id"]
220220

221221
sanitized_stream_name = sanitize_stream(stream_name, stream_id)
222-
sanitized_topic_name = sanitize_topic(topic_name)
222+
sanitized_topic_name = sanitize(topic_name)
223223

224224
messages = read_zulip_messages_for_topic(
225225
json_root, sanitized_stream_name, sanitized_topic_name

tests/testCommon.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,26 @@ def assert_equal(v1, v2):
2424

2525

2626
def test_sanitize():
27-
assert_equal(url.sanitize_stream(stream_name="foo bar", stream_id=7), "7-foo-bar")
27+
assert_equal(
28+
url.sanitize_stream(stream_name="foo bar", stream_id=7),
29+
"7-foo-bar",
30+
)
31+
assert_equal(
32+
url.sanitize_stream(stream_name="foo/bar/turtle[🐢]", stream_id=7),
33+
"7-foo.2Fbar.2Fturtle.5B.F0.9F.90.A2.5D",
34+
)
2835

2936
assert_equal(
30-
url.sanitize_topic(topic_name="pick a place for lunch"),
31-
"pick.20a.20place.20for.20lunch",
37+
url.sanitize("pick a place for lunch *"),
38+
"pick.20a.20place.20for.20lunch.20.2A",
39+
)
40+
assert_equal(
41+
url.sanitize("!!cute-turlte/tortoise (🐢)?"),
42+
".21.21cute-turlte.2Ftortoise.20.28.F0.9F.90.A2.29.3F",
43+
)
44+
assert_equal(
45+
url.sanitize('"the mighty turtle 🐢"'),
46+
".22the.20mighty.20turtle.20.F0.9F.90.A2.22",
3247
)
3348

3449

0 commit comments

Comments
 (0)