Skip to content

Commit 980f916

Browse files
committed
Switch to hashify phf, gain extra 20-30% unescape speed-up
See #82 for details.
1 parent f87d1b4 commit 980f916

File tree

4 files changed

+45
-12
lines changed

4 files changed

+45
-12
lines changed

Cargo.lock

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ serde_json = { version = "1.0", optional = true }
3939
memchr = "2.5.0"
4040
paste = "1.0.11"
4141
phf = { version = "0.11.1", default-features = false, optional = true }
42+
hashify = "0.2.6"
4243

4344
[dev-dependencies]
4445
assert2 = "0.3.7"

build.rs

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ fn generate_entities_rs(entities: &[(String, String)]) {
5252
/// Entity | Codepoints | Glyph\n\
5353
/// -------------------------------|--------------------|------").unwrap();
5454

55+
let mut hashify = String::new();
56+
5557
let mut map_builder = phf_codegen::Map::<&[u8]>::new();
5658
let mut max_len: usize = 0;
5759
let mut min_len: usize = usize::MAX;
@@ -60,6 +62,15 @@ fn generate_entities_rs(entities: &[(String, String)]) {
6062
max_len = max(max_len, name.len());
6163
min_len = min(min_len, name.len());
6264

65+
{
66+
use std::fmt::Write;
67+
write!(&mut hashify, "\n b\"{name}\" => &[").unwrap();
68+
for &byte in glyph.as_bytes() {
69+
write!(&mut hashify, "{byte},").unwrap();
70+
}
71+
write!(&mut hashify, "],").unwrap();
72+
}
73+
6374
// `{:28}` would pad the output inside the backticks.
6475
let name = format!("`{name}`");
6576

@@ -84,15 +95,23 @@ fn generate_entities_rs(entities: &[(String, String)]) {
8495
let map = map_builder.build();
8596
writeln!(
8697
out,
87-
"\
88-
#[allow(clippy::unreadable_literal)]\n\
89-
pub static ENTITIES: phf::Map<&[u8], &[u8]> = {map};\n\
90-
\n\
91-
/// Length of longest entity including ‘&’ and possibly ‘;’.\n\
92-
pub const ENTITY_MAX_LENGTH: usize = {max_len};\n\
93-
\n\
94-
/// Length of shortest entity including ‘&’ and possibly ‘;’.\n\
95-
pub const ENTITY_MIN_LENGTH: usize = {min_len};"
98+
r#"#[allow(clippy::unreadable_literal)]
99+
pub static ENTITIES: phf::Map<&[u8], &[u8]> = {map};
100+
101+
/// Length of longest entity including ‘&’ and possibly ‘;’.
102+
pub const ENTITY_MAX_LENGTH: usize = {max_len};
103+
104+
/// Length of shortest entity including ‘&’ and possibly ‘;’.
105+
pub const ENTITY_MIN_LENGTH: usize = {min_len};
106+
107+
/// Get an unescaped character by its HTML entity
108+
pub(crate) fn get_entity(candidate: &[u8]) -> Option<&[u8]> {{
109+
hashify::map! {{
110+
candidate,
111+
&[u8],{hashify}
112+
}}
113+
}}
114+
"#
96115
)
97116
.unwrap();
98117
}

src/unescape.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
//! in the public API, but it still builds the slow versions so that all
1818
//! functions can be tested.
1919
20+
use crate::get_entity;
2021
use paste::paste;
2122
use std::borrow::Cow;
2223
use std::char;
@@ -366,7 +367,7 @@ fn match_entity_slow<'a>(
366367
iter: &'a mut slice::Iter<u8>,
367368
context: Context,
368369
) -> Option<Cow<'a, [u8]>> {
369-
use crate::{ENTITIES, ENTITY_MAX_LENGTH, ENTITY_MIN_LENGTH};
370+
use crate::{ENTITY_MAX_LENGTH, ENTITY_MIN_LENGTH};
370371
use std::cmp::min;
371372

372373
assert_peek_eq(iter, Some(b'&'), "match_entity() expected '&'");
@@ -450,15 +451,15 @@ fn match_entity_slow<'a>(
450451
// See `unescape_in()` documentation for examples.
451452
//
452453
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
453-
if let Some(&expansion) = ENTITIES.get(candidate) {
454+
if let Some(expansion) = get_entity(candidate) {
454455
*iter = candidate_iter;
455456
return Some(expansion.into());
456457
}
457458
} else {
458459
// Find longest matching entity.
459460
let max_len = min(candidate.len(), ENTITY_MAX_LENGTH);
460461
for check_len in (ENTITY_MIN_LENGTH..=max_len).rev() {
461-
if let Some(&expansion) = ENTITIES.get(&candidate[..check_len]) {
462+
if let Some(expansion) = get_entity(&candidate[..check_len]) {
462463
// Found a match. check_len never drops below ENTITY_MIN_LENGTH, which
463464
// must always be greater than 0, so `check_len - 1` is safe.
464465
debug_assert!(check_len >= 1);

0 commit comments

Comments
 (0)