Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

226 changes: 226 additions & 0 deletions provider/source/tests/dnametest.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use std::collections::{BTreeMap, BTreeSet};

use icu::locale::{
fallback::{LocaleFallbackConfig, LocaleFallbackPriority, LocaleFallbacker},
locale, LocaleExpander,
};
use icu_experimental::displaynames::provider::RegionDisplayNamesV1;
use icu_provider::prelude::*;
use icu_provider_source::SourceDataProvider;
use litemap::LiteMap;
use ndarray::{Array2, Axis};
use tinystr::TinyAsciiStr;
use zerotrie::ZeroTrieSimpleAscii;

// Exploratory size experiment for region display names.
//
// Loads every `RegionDisplayNamesV1` payload from a local CLDR JSON checkout, builds a
// (locale x region) matrix of indices into a sorted list of unique names, clears cells whose
// value equals the value found on the locale's fallback chain, and prints the sizes of
// several `ZeroTrieSimpleAscii` encodings of what remains. Nothing is asserted; all results
// are reported through `println!`.
//
// NOTE(review): the CLDR path below is an absolute path inside one user's home directory,
// so the `unwrap()` on `with_cldr` will presumably panic on any other machine — confirm
// how this should be sourced before the PR leaves draft.
#[test]
fn dnametest() {
let provider = SourceDataProvider::new_custom()
.with_cldr(&std::path::PathBuf::from(
"/home/sffc/lib/cldr-46.0.0-json-full",
))
.unwrap();

// Every data identifier that has region display names, mapped to its position in the
// `iter_ids` output. That position is later used as a matrix row index.
let locales: BTreeMap<DataIdentifierCow<'_>, usize> =
IterableDataProvider::<RegionDisplayNamesV1>::iter_ids(&provider)
.unwrap()
.into_iter()
.enumerate()
.map(|(i, v)| (v, i))
.collect();

// The loaded payload for each identifier in `locales`.
let payloads: BTreeMap<DataIdentifierCow, DataPayload<RegionDisplayNamesV1>> = locales
.keys()
.map(|locale| {
let payload = provider
.load(DataRequest {
id: locale.as_borrowed(),
..Default::default()
})
.unwrap()
.payload;
(locale.clone(), payload)
})
.collect();

// All display names across all payloads, de-duplicated and sorted by going through a
// `BTreeSet`; the sorted order is what the `binary_search` further down relies on.
let unique_names: Vec<&str> = payloads
.values()
.flat_map(|v| v.get().names.iter_values())
.collect::<BTreeSet<_>>()
.into_iter()
.collect();
// ceil(log2(count)): the number of bits needed to address one entry of `unique_names`.
let unique_names_required_bits = (unique_names.len() as f64).log2().ceil() as usize;
println!("unique_names: {} ({unique_names_required_bits})", unique_names.len());

// The set of region codes (matrix columns) is taken from the keys of the "en" payload only.
let regions: BTreeSet<TinyAsciiStr<3>> = payloads
.get(&DataIdentifierCow::from_locale(locale!("en").into()))
.unwrap()
.get()
.names
.iter_keys()
.map(|s| s.try_into_tinystr().unwrap())
.collect();

// Fallback is configured with script priority for the rest of the test.
let expander = LocaleExpander::try_new_common_unstable(&provider).unwrap();
let fallbacker = LocaleFallbacker::try_new_unstable(&provider).unwrap();
let mut config = LocaleFallbackConfig::default();
config.priority = LocaleFallbackPriority::Script;
let fallbacker = fallbacker.for_config(config);

// For each locale, walk its fallback chain until reaching a locale with unknown language
// and no region (its "script parent"). The distinct results are sorted and numbered;
// locales whose chain reaches the unknown locale first are reported and skipped.
let script_locales: BTreeMap<DataIdentifierCow, usize> = locales
.keys()
.filter_map(|locale| {
let mut fallback_iterator = fallbacker.fallback_for(locale.locale);
loop {
let parent_locale = fallback_iterator.get();
if parent_locale.is_unknown() {
println!("Didn't find script parent for: {:?}", locale.locale);
break None;
}
if parent_locale.language.is_unknown() && parent_locale.region.is_none() {
break Some(DataIdentifierCow::from_locale(*parent_locale));
}
fallback_iterator.step();
}
})
.collect::<BTreeSet<_>>() // put in order
.into_iter()
.enumerate()
.map(|(a, b)| (b, a))
.collect();

// Rows: one per locale, followed by one per script locale. Columns: one per region.
// A cell holds an index into `unique_names`, or `None` when there is no entry.
let mut dense_matrix =
Array2::<Option<usize>>::default((locales.len() + script_locales.len(), regions.len()));

// Populate the locale rows.
// NOTE(review): row `i` here is the iteration position in `payloads`, whereas `locales`
// stores the `iter_ids` enumeration position; the two agree only if `iter_ids` yields
// identifiers in sorted order — TODO confirm.
for (i, (_locale, payload)) in payloads.iter().enumerate() {
for (j, region) in regions.iter().enumerate() {
if let Some(name) = payload.get().names.get(&region.to_unvalidated()) {
let index = unique_names.binary_search(&name).unwrap();
dense_matrix[(i, j)] = Some(index);
}
}
}

// Populate each script-locale row by copying the row of the locale obtained by maximizing
// the script locale and then minimizing it in favor of script, when that locale has data.
for (i, script_locale) in script_locales.keys().enumerate() {
let i = i + locales.len();
let mut locale = script_locale.locale.into_locale();
expander.maximize(&mut locale.id);
expander.minimize_favor_script(&mut locale.id);
if let Some(k) = locales.get(&DataIdentifierCow::from_locale((&locale).into())) {
println!("Copying: {locale:?} to {:?}", script_locale.locale);
for (j, _region) in regions.iter().enumerate() {
dense_matrix[(i, j)] = dense_matrix[(*k, j)];
}
}
}

// Remove values that fallback would supply anyway: for each populated cell, step up the
// fallback chain to the first ancestor row (a locale row, else a script-locale row) that
// has a value in the same column. If that value matches, the cell is cleared; in either
// case the search for this cell stops at that ancestor.
for (i, (locale, _payload)) in payloads.iter().enumerate() {
'j: for (j, _region) in regions.iter().enumerate() {
let Some(value) = dense_matrix[(i, j)] else {
continue;
};
let mut fallback_iterator = fallbacker.fallback_for(locale.locale);
loop {
// Step first so the locale is never compared against its own row.
fallback_iterator.step();
let parent_locale = *fallback_iterator.get();
if parent_locale.is_unknown() {
break;
}
if let Some(k) = locales
.get(&DataIdentifierCow::from_locale(parent_locale))
.copied()
.or_else(|| {
script_locales
.get(&DataIdentifierCow::from_locale(parent_locale))
.map(|x| x + locales.len())
})
{
if let Some(parent_value) = dense_matrix[(k, j)] {
if parent_value == value {
dense_matrix[(i, j)] = None;
}
continue 'j;
}
}
}
}
}

// Number of remaining (non-`None`) cells in each row.
let large_small = dense_matrix.map_axis(Axis(1), |values| {
values.iter().filter(|v| v.is_some()).count()
});

for (i, locale) in locales.keys().chain(script_locales.keys()).enumerate() {
println!("{locale:<3}: {}", large_small[i]);
}

// Identifiers whose row still has at least one cell.
let locales_with_data: Vec<DataIdentifierBorrowed> = locales
.keys()
.chain(script_locales.keys())
.enumerate()
.filter(|(i, _)| large_small[*i] != 0)
.map(|(_, locale)| locale.as_borrowed())
.collect();
println!("locales_with_data: {}", locales_with_data.len());

// Size of a trie containing only the identifiers that have data.
let locales_only_zerotrie: ZeroTrieSimpleAscii<Vec<u8>> = locales_with_data
.iter()
.enumerate()
.map(|(i, locale)| (locale.to_string(), i))
.collect();
println!("locales_only_zerotrie: {}", locales_only_zerotrie.byte_len());

// Size of a trie containing only the region codes.
let regions_only_zerotrie: ZeroTrieSimpleAscii<Vec<u8>> = regions.iter().enumerate()
.map(|(i, locale)| (locale.to_string(), i))
.collect();

println!("regions_only_zerotrie: {}", regions_only_zerotrie.byte_len());

// Fully sparse encoding: one "{locale}/{region}" key per remaining cell, mapped to the
// cell's index into `unique_names`.
let sparse_map: LiteMap<String, usize> = locales
.keys()
.chain(script_locales.keys())
.enumerate()
.flat_map(|(i, locale)| {
let dense_matrix = &dense_matrix;
regions.iter().enumerate().filter_map(move |(j, region)| {
dense_matrix[(i, j)].map(|index| (format!("{locale}/{region}"), index))
})
})
.collect();
println!("sparse_map: {}", sparse_map.len());

let sparse_zerotrie: ZeroTrieSimpleAscii<Vec<u8>> =
sparse_map.iter().map(|(k, v)| (k, *v)).collect();
println!("sparse_zerotrie: {}", sparse_zerotrie.byte_len());

// Size in bits of one dense row: one `unique_names` index per region.
let dense_row_bit_size = regions.len() * unique_names_required_bits;

// Hybrid encoding: a row whose own trie would be larger than a dense row is replaced by a
// single "{locale}" key with value 0 and counted in `num_dense_locales`; every other row
// keeps its sparse "{locale}/{region}" keys.
let mut num_dense_locales = 0;
let hybrid_sparse_map: LiteMap<String, usize> = locales
.keys()
.chain(script_locales.keys())
.enumerate()
.flat_map(|(i, locale)| {
let dense_matrix = &dense_matrix;
let row: Vec<(String, usize)> = regions.iter().enumerate().filter_map(move |(j, region)| {
dense_matrix[(i, j)].map(|index| (format!("{locale}/{region}"), index))
}).collect();
let inner_zerotrie: ZeroTrieSimpleAscii<_> = row.iter().map(|(k, v)| (k, *v)).collect();
if inner_zerotrie.byte_len() * 8 > dense_row_bit_size {
num_dense_locales += 1;
vec![(locale.to_string(), 0)].into_iter()
} else {
row.into_iter()
}
})
.collect();
println!("hybrid_sparse_map: {}", hybrid_sparse_map.len());
println!("num_dense_locales: {} ({} B)", num_dense_locales, num_dense_locales * dense_row_bit_size / 8);

let hybrid_sparse_zerotrie: ZeroTrieSimpleAscii<Vec<u8>> =
hybrid_sparse_map.iter().map(|(k, v)| (k, *v)).collect();
println!("hybrid_sparse_zerotrie: {}", hybrid_sparse_zerotrie.byte_len());
}
2 changes: 2 additions & 0 deletions utils/tinystr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ displaydoc = { workspace = true }
serde = { workspace = true, features = ["alloc"], optional = true }
zerovec = { workspace = true, optional = true }
databake = { workspace = true, optional = true }
potential_utf = { workspace = true, optional = true }

[dev-dependencies]
bincode = { workspace = true }
Expand All @@ -43,6 +44,7 @@ default = ["alloc"]
alloc = ["zerovec?/alloc"]
zerovec = ["dep:zerovec"]
databake = ["dep:databake"]
potential_utf = ["dep:potential_utf"]
serde = ["dep:serde"]
# No longer does anything
std = []
Expand Down
6 changes: 6 additions & 0 deletions utils/tinystr/src/unvalidated.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ impl<const N: usize> UnvalidatedTinyAsciiStr<N> {
pub const fn from_utf8_unchecked(bytes: [u8; N]) -> Self {
Self(bytes)
}

/// Borrows the underlying bytes of this string as a `PotentialUtf8`.
///
/// Only compiled when the `potential_utf` Cargo feature is enabled.
#[cfg(feature = "potential_utf")]
#[inline]
pub const fn as_potential_utf8(&self) -> &potential_utf::PotentialUtf8 {
    // View the fixed-size backing array as a plain byte slice before wrapping it.
    let raw: &[u8] = &self.0;
    potential_utf::PotentialUtf8::from_bytes(raw)
}
}

impl<const N: usize> TinyAsciiStr<N> {
Expand Down
36 changes: 36 additions & 0 deletions utils/writeable/src/impls.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,3 +264,39 @@ fn test_string_impls() {
let arr: &[&String] = &[&String::new(), &"abc".to_owned()];
check_writeable_slice(arr);
}

macro_rules! impl_write_tuple {
($($index:tt $ty:ident),+) => {
impl<$($ty),+> $crate::Writeable for ($($ty),+) where $($ty: $crate::Writeable),+ {
#[inline]
fn write_to<W: fmt::Write + ?Sized>(&self, sink: &mut W) -> fmt::Result {
$(
<$ty as $crate::Writeable>::write_to(&self.$index, sink)?;
)+
Ok(())
}
#[inline]
fn write_to_parts<W: PartsWrite + ?Sized>(&self, sink: &mut W) -> fmt::Result {
$(
<$ty as $crate::Writeable>::write_to_parts(&self.$index, sink)?;
)+
Ok(())
}
#[inline]
fn writeable_length_hint(&self) -> LengthHint {
let mut sum = LengthHint::exact(0);
$(
sum += <$ty as $crate::Writeable>::writeable_length_hint(&self.$index);
)+
sum
}
}
};
}

impl_write_tuple!(0 A, 1 B);
impl_write_tuple!(0 A, 1 B, 2 C);
impl_write_tuple!(0 A, 1 B, 2 C, 3 D);
impl_write_tuple!(0 A, 1 B, 2 C, 3 D, 4 E);
impl_write_tuple!(0 A, 1 B, 2 C, 3 D, 4 E, 5 F);
impl_write_tuple!(0 A, 1 B, 2 C, 3 D, 4 E, 5 F, 6 G);
55 changes: 55 additions & 0 deletions utils/zerotrie/tests/dense_test.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use zerotrie::ZeroTrieSimpleAscii;
use zerovec::VarZeroVec;

/// Pairs every string with its zero-based position in the input, yielding
/// `(string, index)` tuples in iteration order.
fn s2t<'a>(strings: impl Iterator<Item = &'a str>) -> impl Iterator<Item = (&'a str, usize)> {
    strings.zip(0usize..)
}

/// Pins the encoded byte sizes of a small `language/region` key set under several layouts:
/// all keys in one trie, all keys in a `VarZeroVec`, separate language and region tries,
/// and a hybrid trie holding the dense languages plus the keys outside the dense grid.
#[test]
fn test_issue_sizes() {
    static KEYS: &[&str] = &[
        "ar/FR", "ar/IR", "ar/SA", "ar/UK", "ar/US", "en/AU", "en/FR", "en/UK", "en/US", "fr/FR",
        "fr/SA", "fr/UK", "fr/US", "it/IT",
    ];
    static DENSE_LANGS: &[&str] = &["ar", "en", "fr"];
    static DENSE_REGIONS: &[&str] = &["FR", "SA", "UK", "US"];

    // Every key stored individually.
    let all_keys_trie = ZeroTrieSimpleAscii::from_iter(s2t(KEYS.iter().copied()));
    assert_eq!(all_keys_trie.byte_len(), 71);

    let all_keys_vzv = VarZeroVec::<str>::from(KEYS);
    assert_eq!(all_keys_vzv.as_bytes().len(), 98);

    // The two axes of the dense grid, each in its own trie.
    let lang_axis_trie = ZeroTrieSimpleAscii::from_iter(s2t(DENSE_LANGS.iter().copied()));
    assert_eq!(lang_axis_trie.byte_len(), 12);

    let region_axis_trie = ZeroTrieSimpleAscii::from_iter(s2t(DENSE_REGIONS.iter().copied()));
    assert_eq!(region_axis_trie.byte_len(), 16);

    // Keep only the keys that are not some `lang/region` combination of the dense grid.
    let hybrid_keys: Vec<&&str> = KEYS
        .iter()
        .filter(|key| {
            let in_dense_grid = DENSE_LANGS.iter().any(|lang| {
                DENSE_REGIONS
                    .iter()
                    .any(|region| writeable::cmp_str(&(lang, '/', region), key).is_eq())
            });
            !in_dense_grid
        })
        .collect();
    assert_eq!(hybrid_keys.len(), 3);

    // Leftover keys first, then the dense languages, numbered in that order.
    let hybrid_entries = hybrid_keys
        .iter()
        .copied()
        .chain(DENSE_LANGS.iter())
        .copied();
    let hybrid_zerotrie = ZeroTrieSimpleAscii::from_iter(s2t(hybrid_entries));
    assert_eq!(hybrid_zerotrie.byte_len(), 27);
}
Loading