diff --git a/Cargo.lock b/Cargo.lock
index 03a69fc5d76..66f43bbf7b6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3135,6 +3135,7 @@ dependencies = [
  "databake",
  "displaydoc",
  "postcard",
+ "potential_utf",
  "rand",
  "serde",
  "serde_json",
diff --git a/provider/source/tests/dnametest.rs b/provider/source/tests/dnametest.rs
new file mode 100644
index 00000000000..9e15171856c
--- /dev/null
+++ b/provider/source/tests/dnametest.rs
@@ -0,0 +1,268 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use std::collections::{BTreeMap, BTreeSet};
+
+use icu::locale::{
+    fallback::{LocaleFallbackConfig, LocaleFallbackPriority, LocaleFallbacker},
+    locale, LocaleExpander,
+};
+use icu_experimental::displaynames::provider::RegionDisplayNamesV1;
+use icu_provider::prelude::*;
+use icu_provider_source::SourceDataProvider;
+use litemap::LiteMap;
+use ndarray::{Array2, Axis};
+use tinystr::TinyAsciiStr;
+use zerotrie::ZeroTrieSimpleAscii;
+
+/// Size experiment: prints how many bytes several encodings of the region display
+/// names need (locale/region `ZeroTrie`s, a fully sparse trie, and a sparse/dense hybrid).
+///
+/// No sizes are asserted; the numbers are only printed. The test needs a full CLDR JSON
+/// checkout, whose location is read from `ICU4X_CLDR_JSON_ROOT`; it is skipped when the
+/// variable is unset.
+#[test]
+fn dnametest() {
+    // CLDR is too large to vendor, so take its location from the environment rather than
+    // from a path that only exists on one developer's machine.
+    const CLDR_ROOT_ENV: &str = "ICU4X_CLDR_JSON_ROOT";
+    let Some(cldr_root) = std::env::var_os(CLDR_ROOT_ENV) else {
+        eprintln!("dnametest skipped: set {CLDR_ROOT_ENV} to a CLDR JSON directory");
+        return;
+    };
+    let provider = SourceDataProvider::new_custom()
+        .with_cldr(std::path::Path::new(&cldr_root))
+        .unwrap();
+
+    // Every locale with region display names, mapped to its row index in `dense_matrix`.
+    let locales: BTreeMap<DataIdentifierCow<'_>, usize> =
+        IterableDataProvider::<RegionDisplayNamesV1>::iter_ids(&provider)
+            .unwrap()
+            .into_iter()
+            .enumerate()
+            .map(|(i, v)| (v, i))
+            .collect();
+
+    let payloads: BTreeMap<DataIdentifierCow, DataPayload<RegionDisplayNamesV1>> = locales
+        .keys()
+        .map(|locale| {
+            let payload = provider
+                .load(DataRequest {
+                    id: locale.as_borrowed(),
+                    ..Default::default()
+                })
+                .unwrap()
+                .payload;
+            (locale.clone(), payload)
+        })
+        .collect();
+
+    // Sorted and deduplicated, so `binary_search` below yields a stable index per name.
+    let unique_names: Vec<&str> = payloads
+        .values()
+        .flat_map(|v| v.get().names.iter_values())
+        .collect::<BTreeSet<_>>()
+        .into_iter()
+        .collect();
+    // ceil(log2(len)) without going through floating point.
+    let unique_names_required_bits =
+        unique_names.len().next_power_of_two().trailing_zeros() as usize;
+    println!(
+        "unique_names: {} ({unique_names_required_bits})",
+        unique_names.len()
+    );
+
+    // The regions named in `en` define the columns of `dense_matrix`.
+    let regions: BTreeSet<TinyAsciiStr<3>> = payloads
+        .get(&DataIdentifierCow::from_locale(locale!("en").into()))
+        .unwrap()
+        .get()
+        .names
+        .iter_keys()
+        .map(|s| s.try_into_tinystr().unwrap())
+        .collect();
+
+    let expander = LocaleExpander::try_new_common_unstable(&provider).unwrap();
+    let fallbacker = LocaleFallbacker::try_new_unstable(&provider).unwrap();
+    let mut config = LocaleFallbackConfig::default();
+    config.priority = LocaleFallbackPriority::Script;
+    let fallbacker = fallbacker.for_config(config);
+
+    // For each locale, the first entry of its fallback chain that has neither a language
+    // nor a region. These get extra rows after the real locales.
+    let script_locales: BTreeMap<DataIdentifierCow, usize> = locales
+        .keys()
+        .filter_map(|locale| {
+            let mut fallback_iterator = fallbacker.fallback_for(locale.locale);
+            loop {
+                let parent_locale = fallback_iterator.get();
+                if parent_locale.is_unknown() {
+                    println!("Didn't find script parent for: {:?}", locale.locale);
+                    break None;
+                }
+                if parent_locale.language.is_unknown() && parent_locale.region.is_none() {
+                    break Some(DataIdentifierCow::from_locale(*parent_locale));
+                }
+                fallback_iterator.step();
+            }
+        })
+        .collect::<BTreeSet<_>>() // put in order
+        .into_iter()
+        .enumerate()
+        .map(|(a, b)| (b, a))
+        .collect();
+
+    // dense_matrix[(row, col)] is the index into `unique_names`, or `None` when absent.
+    let mut dense_matrix =
+        Array2::<Option<usize>>::default((locales.len() + script_locales.len(), regions.len()));
+
+    for (i, (_locale, payload)) in payloads.iter().enumerate() {
+        for (j, region) in regions.iter().enumerate() {
+            if let Some(name) = payload.get().names.get(&region.to_unvalidated()) {
+                let index = unique_names.binary_search(&name).unwrap();
+                dense_matrix[(i, j)] = Some(index);
+            }
+        }
+    }
+
+    // Seed each script row from the locale it maximizes/minimizes to, if that locale has data.
+    for (i, script_locale) in script_locales.keys().enumerate() {
+        let i = i + locales.len();
+        let mut locale = script_locale.locale.into_locale();
+        expander.maximize(&mut locale.id);
+        expander.minimize_favor_script(&mut locale.id);
+        if let Some(k) = locales.get(&DataIdentifierCow::from_locale((&locale).into())) {
+            println!("Copying: {locale:?} to {:?}", script_locale.locale);
+            for (j, _region) in regions.iter().enumerate() {
+                dense_matrix[(i, j)] = dense_matrix[(*k, j)];
+            }
+        }
+    }
+
+    // Deduplicate: drop a cell when the nearest ancestor that has a value for the same
+    // region carries the identical name.
+    for (i, (locale, _payload)) in payloads.iter().enumerate() {
+        'j: for (j, _region) in regions.iter().enumerate() {
+            let Some(value) = dense_matrix[(i, j)] else {
+                continue;
+            };
+            let mut fallback_iterator = fallbacker.fallback_for(locale.locale);
+            loop {
+                fallback_iterator.step();
+                let parent_locale = *fallback_iterator.get();
+                if parent_locale.is_unknown() {
+                    break;
+                }
+                if let Some(k) = locales
+                    .get(&DataIdentifierCow::from_locale(parent_locale))
+                    .copied()
+                    .or_else(|| {
+                        script_locales
+                            .get(&DataIdentifierCow::from_locale(parent_locale))
+                            .map(|x| x + locales.len())
+                    })
+                {
+                    if let Some(parent_value) = dense_matrix[(k, j)] {
+                        if parent_value == value {
+                            dense_matrix[(i, j)] = None;
+                        }
+                        continue 'j;
+                    }
+                }
+            }
+        }
+    }
+
+    // Number of surviving cells per row.
+    let row_counts = dense_matrix.map_axis(Axis(1), |values| {
+        values.iter().filter(|v| v.is_some()).count()
+    });
+
+    for (i, locale) in locales.keys().chain(script_locales.keys()).enumerate() {
+        println!("{locale:<3}: {}", row_counts[i]);
+    }
+
+    let locales_with_data: Vec<DataIdentifierBorrowed> = locales
+        .keys()
+        .chain(script_locales.keys())
+        .enumerate()
+        .filter(|(i, _)| row_counts[*i] != 0)
+        .map(|(_, locale)| locale.as_borrowed())
+        .collect();
+    println!("locales_with_data: {}", locales_with_data.len());
+
+    let locales_only_zerotrie: ZeroTrieSimpleAscii<Vec<u8>> = locales_with_data
+        .iter()
+        .enumerate()
+        .map(|(i, locale)| (locale.to_string(), i))
+        .collect();
+    println!("locales_only_zerotrie: {}", locales_only_zerotrie.byte_len());
+
+    let regions_only_zerotrie: ZeroTrieSimpleAscii<Vec<u8>> = regions
+        .iter()
+        .enumerate()
+        .map(|(i, region)| (region.to_string(), i))
+        .collect();
+    println!("regions_only_zerotrie: {}", regions_only_zerotrie.byte_len());
+
+    // Fully sparse encoding: one "locale/region" key per surviving cell.
+    let sparse_map: LiteMap<String, usize> = locales
+        .keys()
+        .chain(script_locales.keys())
+        .enumerate()
+        .flat_map(|(i, locale)| {
+            let dense_matrix = &dense_matrix;
+            regions.iter().enumerate().filter_map(move |(j, region)| {
+                dense_matrix[(i, j)].map(|index| (format!("{locale}/{region}"), index))
+            })
+        })
+        .collect();
+    println!("sparse_map: {}", sparse_map.len());
+
+    let sparse_zerotrie: ZeroTrieSimpleAscii<Vec<u8>> =
+        sparse_map.iter().map(|(k, v)| (k, *v)).collect();
+    println!("sparse_zerotrie: {}", sparse_zerotrie.byte_len());
+
+    let dense_row_bit_size = regions.len() * unique_names_required_bits;
+
+    // Hybrid encoding: a row whose own trie would be larger than a dense row collapses to
+    // a single bare-locale key; every other row stays sparse.
+    let mut num_dense_locales = 0;
+    let hybrid_sparse_map: LiteMap<String, usize> = locales
+        .keys()
+        .chain(script_locales.keys())
+        .enumerate()
+        .flat_map(|(i, locale)| {
+            let dense_matrix = &dense_matrix;
+            let row: Vec<(String, usize)> = regions
+                .iter()
+                .enumerate()
+                .filter_map(move |(j, region)| {
+                    dense_matrix[(i, j)].map(|index| (format!("{locale}/{region}"), index))
+                })
+                .collect();
+            let inner_zerotrie: ZeroTrieSimpleAscii<_> =
+                row.iter().map(|(k, v)| (k, *v)).collect();
+            if inner_zerotrie.byte_len() * 8 > dense_row_bit_size {
+                num_dense_locales += 1;
+                vec![(locale.to_string(), 0)].into_iter()
+            } else {
+                row.into_iter()
+            }
+        })
+        .collect();
+    println!("hybrid_sparse_map: {}", hybrid_sparse_map.len());
+    println!(
+        "num_dense_locales: {} ({} B)",
+        num_dense_locales,
+        num_dense_locales * dense_row_bit_size / 8
+    );
+
+    let hybrid_sparse_zerotrie: ZeroTrieSimpleAscii<Vec<u8>> =
+        hybrid_sparse_map.iter().map(|(k, v)| (k, *v)).collect();
+    println!(
+        "hybrid_sparse_zerotrie: {}",
+        hybrid_sparse_zerotrie.byte_len()
+    );
+}
diff --git a/utils/tinystr/Cargo.toml b/utils/tinystr/Cargo.toml
index f633493b4e4..10e66318fba 100644
--- a/utils/tinystr/Cargo.toml
+++ b/utils/tinystr/Cargo.toml
@@ -28,6 +28,7 @@ displaydoc = { workspace = true }
 serde = { workspace = true, features = ["alloc"], optional = true }
 zerovec = { workspace = true, optional = true }
 databake = { workspace = true, optional = true }
+potential_utf = { workspace = true, optional = true }
 
 [dev-dependencies]
 bincode = { workspace = true }
@@ -43,6 +44,7 @@ default = ["alloc"]
 alloc = ["zerovec?/alloc"]
 zerovec = ["dep:zerovec"]
 databake = ["dep:databake"]
+potential_utf = 
["dep:potential_utf"]
 serde = ["dep:serde"]
 # No longer does anything
 std = []
diff --git a/utils/tinystr/src/unvalidated.rs b/utils/tinystr/src/unvalidated.rs
index 3758b64ff57..6b176545d35 100644
--- a/utils/tinystr/src/unvalidated.rs
+++ b/utils/tinystr/src/unvalidated.rs
@@ -38,6 +38,15 @@ impl<const N: usize> UnvalidatedTinyAsciiStr<N> {
     pub const fn from_utf8_unchecked(bytes: [u8; N]) -> Self {
         Self(bytes)
     }
+
+    /// Borrows the underlying bytes as a [`potential_utf::PotentialUtf8`].
+    ///
+    /// No validation is performed. Requires the `potential_utf` Cargo feature.
+    #[cfg(feature = "potential_utf")]
+    #[inline]
+    pub const fn as_potential_utf8(&self) -> &potential_utf::PotentialUtf8 {
+        potential_utf::PotentialUtf8::from_bytes(&self.0)
+    }
 }
 
 impl<const N: usize> TinyAsciiStr<N> {
diff --git a/utils/writeable/src/impls.rs b/utils/writeable/src/impls.rs
index 12f6d6d5dbb..a4d57ebc5bd 100644
--- a/utils/writeable/src/impls.rs
+++ b/utils/writeable/src/impls.rs
@@ -264,3 +264,42 @@ fn test_string_impls() {
     let arr: &[&String] = &[&String::new(), &"abc".to_owned()];
     check_writeable_slice(arr);
 }
+
+// Implements `Writeable` for a tuple by writing its fields in order, with no separator.
+// Each argument pair is a tuple index followed by the type parameter for that field.
+macro_rules! impl_write_tuple {
+    ($($index:tt $ty:ident),+) => {
+        impl<$($ty),+> $crate::Writeable for ($($ty),+) where $($ty: $crate::Writeable),+ {
+            #[inline]
+            fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
+                $(
+                    <$ty as $crate::Writeable>::write_to(&self.$index, sink)?;
+                )+
+                Ok(())
+            }
+            #[inline]
+            fn write_to_parts<W: $crate::PartsWrite + ?Sized>(&self, sink: &mut W) -> fmt::Result {
+                $(
+                    <$ty as $crate::Writeable>::write_to_parts(&self.$index, sink)?;
+                )+
+                Ok(())
+            }
+            #[inline]
+            fn writeable_length_hint(&self) -> LengthHint {
+                // The hint of the concatenation is the sum of the fields' hints.
+                let mut sum = LengthHint::exact(0);
+                $(
+                    sum += <$ty as $crate::Writeable>::writeable_length_hint(&self.$index);
+                )+
+                sum
+            }
+        }
+    };
+}
+
+impl_write_tuple!(0 A, 1 B);
+impl_write_tuple!(0 A, 1 B, 2 C);
+impl_write_tuple!(0 A, 1 B, 2 C, 3 D);
+impl_write_tuple!(0 A, 1 B, 2 C, 3 D, 4 E);
+impl_write_tuple!(0 A, 1 B, 2 C, 3 D, 4 E, 5 F);
+impl_write_tuple!(0 A, 1 B, 2 C, 3 D, 4 E, 5 F, 6 G);
diff --git a/utils/zerotrie/tests/dense_test.rs b/utils/zerotrie/tests/dense_test.rs
new file mode 100644
index 00000000000..7637f6d61aa
--- /dev/null
+++ 
b/utils/zerotrie/tests/dense_test.rs
@@ -0,0 +1,57 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use zerotrie::ZeroTrieSimpleAscii;
+use zerovec::VarZeroVec;
+
+/// Pairs each string with its position in the input, in the `(key, value)` order
+/// that `ZeroTrieSimpleAscii::from_iter` consumes.
+fn s2t<'a>(strings: impl Iterator<Item = &'a str>) -> impl Iterator<Item = (&'a str, usize)> {
+    strings.enumerate().map(|(i, s)| (s, i))
+}
+
+/// Compares the byte sizes of sparse, dense, and hybrid encodings of a small key set.
+#[test]
+fn test_issue_sizes() {
+    static KEYS: &[&str] = &[
+        "ar/FR", "ar/IR", "ar/SA", "ar/UK", "ar/US", "en/AU", "en/FR", "en/UK", "en/US", "fr/FR",
+        "fr/SA", "fr/UK", "fr/US", "it/IT",
+    ];
+
+    let sparse_zerotrie = ZeroTrieSimpleAscii::from_iter(s2t(KEYS.iter().copied()));
+    assert_eq!(sparse_zerotrie.byte_len(), 71);
+
+    let sparse_vzv = VarZeroVec::<str>::from(KEYS);
+    assert_eq!(sparse_vzv.as_bytes().len(), 98);
+
+    static DENSE_LANGS: &[&str] = &["ar", "en", "fr"];
+    static DENSE_REGIONS: &[&str] = &["FR", "SA", "UK", "US"];
+
+    let dense_lang_zerotrie = ZeroTrieSimpleAscii::from_iter(s2t(DENSE_LANGS.iter().copied()));
+    assert_eq!(dense_lang_zerotrie.byte_len(), 12);
+
+    let dense_region_zerotrie = ZeroTrieSimpleAscii::from_iter(s2t(DENSE_REGIONS.iter().copied()));
+    assert_eq!(dense_region_zerotrie.byte_len(), 16);
+
+    // Keys that are not covered by the dense language x region grid.
+    let hybrid_keys = KEYS
+        .iter()
+        .filter(|key| {
+            !DENSE_LANGS.iter().any(|lang| {
+                DENSE_REGIONS
+                    .iter()
+                    .any(|region| writeable::cmp_str(&(lang, '/', region), key).is_eq())
+            })
+        })
+        .collect::<Vec<_>>();
+    assert_eq!(hybrid_keys.len(), 3);
+
+    // The hybrid trie stores the leftover keys plus one entry per dense language.
+    let hybrid_zerotrie = ZeroTrieSimpleAscii::from_iter(s2t(hybrid_keys
+        .iter()
+        .copied()
+        .chain(DENSE_LANGS.iter())
+        .copied()));
+    assert_eq!(hybrid_zerotrie.byte_len(), 27);
+}