diff --git a/Cargo.lock b/Cargo.lock index 623cca7..346c4d3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -238,6 +238,17 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "hashify" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f208758247e68e239acaa059e72e4ce1f30f2a4b6523f19c1b923d25b7e9cceb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.22", +] + [[package]] name = "hermit-abi" version = "0.3.1" @@ -250,6 +261,7 @@ version = "1.0.5" dependencies = [ "assert2", "criterion", + "hashify", "iai", "matchgen", "memchr", diff --git a/Cargo.toml b/Cargo.toml index a24e3d8..2c0904c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,7 @@ serde_json = { version = "1.0", optional = true } memchr = "2.5.0" paste = "1.0.11" phf = { version = "0.11.1", default-features = false, optional = true } +hashify = "0.2.6" [dev-dependencies] assert2 = "0.3.7" diff --git a/build.rs b/build.rs index 655977e..bedaa85 100644 --- a/build.rs +++ b/build.rs @@ -52,6 +52,8 @@ fn generate_entities_rs(entities: &[(String, String)]) { /// Entity | Codepoints | Glyph\n\ /// -------------------------------|--------------------|------").unwrap(); + let mut hashify = String::new(); + let mut map_builder = phf_codegen::Map::<&[u8]>::new(); let mut max_len: usize = 0; let mut min_len: usize = usize::MAX; @@ -60,6 +62,15 @@ fn generate_entities_rs(entities: &[(String, String)]) { max_len = max(max_len, name.len()); min_len = min(min_len, name.len()); + { + use std::fmt::Write; + write!(&mut hashify, "\n b\"{name}\" => &[").unwrap(); + for &byte in glyph.as_bytes() { + write!(&mut hashify, "{byte},").unwrap(); + } + write!(&mut hashify, "],").unwrap(); + } + // `{:28}` would pad the output inside the backticks. let name = format!("`{name}`"); @@ -84,15 +95,23 @@ fn generate_entities_rs(entities: &[(String, String)]) { let map = map_builder.build(); writeln!( out, - "\ - #[allow(clippy::unreadable_literal)]\n\ - pub static ENTITIES: phf::Map<&[u8], &[u8]> = {map};\n\ - \n\ - /// Length of longest entity including ‘&’ and possibly ‘;’.\n\ - pub const ENTITY_MAX_LENGTH: usize = {max_len};\n\ - \n\ - /// Length of shortest entity including ‘&’ and possibly ‘;’.\n\ - pub const ENTITY_MIN_LENGTH: usize = {min_len};" + r#"#[allow(clippy::unreadable_literal)] +pub static ENTITIES: phf::Map<&[u8], &[u8]> = {map}; + +/// Length of longest entity including ‘&’ and possibly ‘;’. +pub const ENTITY_MAX_LENGTH: usize = {max_len}; + +/// Length of shortest entity including ‘&’ and possibly ‘;’. +pub const ENTITY_MIN_LENGTH: usize = {min_len}; + +/// Get an unescaped character by its HTML entity +pub(crate) fn get_entity(candidate: &[u8]) -> Option<&[u8]> {{ + hashify::map! {{ + candidate, + &[u8],{hashify} + }} +}} +"# ) .unwrap(); } diff --git a/src/unescape.rs b/src/unescape.rs index 9417f85..a7854ac 100644 --- a/src/unescape.rs +++ b/src/unescape.rs @@ -366,7 +366,7 @@ fn match_entity_slow<'a>( iter: &'a mut slice::Iter, context: Context, ) -> Option> { - use crate::{ENTITIES, ENTITY_MAX_LENGTH, ENTITY_MIN_LENGTH}; + use crate::{get_entity, ENTITY_MAX_LENGTH, ENTITY_MIN_LENGTH}; use std::cmp::min; assert_peek_eq(iter, Some(b'&'), "match_entity() expected '&'"); @@ -450,7 +450,7 @@ fn match_entity_slow<'a>( // See `unescape_in()` documentation for examples. // // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state - if let Some(&expansion) = ENTITIES.get(candidate) { + if let Some(expansion) = get_entity(candidate) { *iter = candidate_iter; return Some(expansion.into()); } @@ -458,7 +458,7 @@ fn match_entity_slow<'a>( // Find longest matching entity. let max_len = min(candidate.len(), ENTITY_MAX_LENGTH); for check_len in (ENTITY_MIN_LENGTH..=max_len).rev() { - if let Some(&expansion) = ENTITIES.get(&candidate[..check_len]) { + if let Some(expansion) = get_entity(&candidate[..check_len]) { // Found a match. check_len starts at ENTITY_MIN_LENGTH, which // must always be greater than 0, so `check_len - 1` is safe. debug_assert!(check_len >= 1);