Skip to content

Switch to hashify phf, gain extra 20-30% unescape speed-up #83

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ serde_json = { version = "1.0", optional = true }
memchr = "2.5.0"
paste = "1.0.11"
phf = { version = "0.11.1", default-features = false, optional = true }
hashify = "0.2.6"

[dev-dependencies]
assert2 = "0.3.7"
Expand Down
37 changes: 28 additions & 9 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@
/// Entity | Codepoints | Glyph\n\
/// -------------------------------|--------------------|------").unwrap();

let mut hashify = String::new();

let mut map_builder = phf_codegen::Map::<&[u8]>::new();
let mut max_len: usize = 0;
let mut min_len: usize = usize::MAX;
Expand All @@ -60,6 +62,15 @@
max_len = max(max_len, name.len());
min_len = min(min_len, name.len());

{
use std::fmt::Write;
write!(&mut hashify, "\n b\"{name}\" => &[").unwrap();
for &byte in glyph.as_bytes() {
write!(&mut hashify, "{byte},").unwrap();
}
write!(&mut hashify, "],").unwrap();
}

// `{:28}` would pad the output inside the backticks.
let name = format!("`{name}`");

Expand All @@ -84,15 +95,23 @@
let map = map_builder.build();
writeln!(
out,
"\
#[allow(clippy::unreadable_literal)]\n\
pub static ENTITIES: phf::Map<&[u8], &[u8]> = {map};\n\
\n\
/// Length of longest entity including ‘&’ and possibly ‘;’.\n\
pub const ENTITY_MAX_LENGTH: usize = {max_len};\n\
\n\
/// Length of shortest entity including ‘&’ and possibly ‘;’.\n\
pub const ENTITY_MIN_LENGTH: usize = {min_len};"
r#"#[allow(clippy::unreadable_literal)]

Check failure on line 98 in build.rs

View workflow job for this annotation

GitHub Actions / cargo clippy

unnecessary hashes around raw string literal
pub static ENTITIES: phf::Map<&[u8], &[u8]> = {map};

/// Length of longest entity including ‘&’ and possibly ‘;’.
pub const ENTITY_MAX_LENGTH: usize = {max_len};

/// Length of shortest entity including ‘&’ and possibly ‘;’.
pub const ENTITY_MIN_LENGTH: usize = {min_len};

/// Get an unescaped character by its HTML entity
pub(crate) fn get_entity(candidate: &[u8]) -> Option<&[u8]> {{
hashify::map! {{
candidate,
&[u8],{hashify}
}}
}}
"#
)
.unwrap();
}
Expand Down
6 changes: 3 additions & 3 deletions src/unescape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ fn match_entity_slow<'a>(
iter: &'a mut slice::Iter<u8>,
context: Context,
) -> Option<Cow<'a, [u8]>> {
use crate::{ENTITIES, ENTITY_MAX_LENGTH, ENTITY_MIN_LENGTH};
use crate::{get_entity, ENTITY_MAX_LENGTH, ENTITY_MIN_LENGTH};
use std::cmp::min;

assert_peek_eq(iter, Some(b'&'), "match_entity() expected '&'");
Expand Down Expand Up @@ -450,15 +450,15 @@ fn match_entity_slow<'a>(
// See `unescape_in()` documentation for examples.
//
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
if let Some(&expansion) = ENTITIES.get(candidate) {
if let Some(expansion) = get_entity(candidate) {
*iter = candidate_iter;
return Some(expansion.into());
}
} else {
// Find longest matching entity.
let max_len = min(candidate.len(), ENTITY_MAX_LENGTH);
for check_len in (ENTITY_MIN_LENGTH..=max_len).rev() {
if let Some(&expansion) = ENTITIES.get(&candidate[..check_len]) {
if let Some(expansion) = get_entity(&candidate[..check_len]) {
// Found a match. check_len never drops below ENTITY_MIN_LENGTH, which
// must always be greater than 0, so `check_len - 1` is safe.
debug_assert!(check_len >= 1);
Expand Down
Loading