@@ -11,12 +11,18 @@ use icu::locale::{
1111use icu_experimental:: displaynames:: provider:: RegionDisplayNamesV1 ;
1212use icu_provider:: prelude:: * ;
1313use icu_provider_source:: SourceDataProvider ;
14+ use litemap:: LiteMap ;
1415use ndarray:: { Array2 , Axis } ;
1516use tinystr:: TinyAsciiStr ;
17+ use zerotrie:: ZeroTrieSimpleAscii ;
1618
1719#[ test]
1820fn dnametest ( ) {
19- let provider = SourceDataProvider :: new ( ) ;
21+ let provider = SourceDataProvider :: new_custom ( )
22+ . with_cldr ( & std:: path:: PathBuf :: from (
23+ "/home/sffc/lib/cldr-46.0.0-json-full" ,
24+ ) )
25+ . unwrap ( ) ;
2026
2127 let locales: BTreeMap < DataIdentifierCow < ' _ > , usize > =
2228 IterableDataProvider :: < RegionDisplayNamesV1 > :: iter_ids ( & provider)
@@ -40,16 +46,23 @@ fn dnametest() {
4046 } )
4147 . collect ( ) ;
4248
43- let en_names = payloads
44- . get ( & DataIdentifierCow :: from_locale ( locale ! ( "en" ) . into ( ) ) )
45- . unwrap ( ) ;
49+ let unique_names: Vec < & str > = payloads
50+ . values ( )
51+ . flat_map ( |v| v. get ( ) . names . iter_values ( ) )
52+ . collect :: < BTreeSet < _ > > ( )
53+ . into_iter ( )
54+ . collect ( ) ;
55+ let unique_names_required_bits = ( unique_names. len ( ) as f64 ) . log2 ( ) . ceil ( ) as usize ;
56+ println ! ( "unique_names: {} ({unique_names_required_bits})" , unique_names. len( ) ) ;
4657
47- let regions = en_names
58+ let regions: BTreeSet < TinyAsciiStr < 3 > > = payloads
59+ . get ( & DataIdentifierCow :: from_locale ( locale ! ( "en" ) . into ( ) ) )
60+ . unwrap ( )
4861 . get ( )
4962 . names
5063 . iter_keys ( )
5164 . map ( |s| s. try_into_tinystr ( ) . unwrap ( ) )
52- . collect :: < BTreeSet < TinyAsciiStr < 3 > > > ( ) ;
65+ . collect ( ) ;
5366
5467 let expander = LocaleExpander :: try_new_common_unstable ( & provider) . unwrap ( ) ;
5568 let fallbacker = LocaleFallbacker :: try_new_unstable ( & provider) . unwrap ( ) ;
@@ -80,11 +93,14 @@ fn dnametest() {
8093 . collect ( ) ;
8194
8295 let mut dense_matrix =
83- Array2 :: < Option < & str > > :: default ( ( locales. len ( ) + script_locales. len ( ) , regions. len ( ) ) ) ;
96+ Array2 :: < Option < usize > > :: default ( ( locales. len ( ) + script_locales. len ( ) , regions. len ( ) ) ) ;
8497
8598 for ( i, ( _locale, payload) ) in payloads. iter ( ) . enumerate ( ) {
8699 for ( j, region) in regions. iter ( ) . enumerate ( ) {
87- dense_matrix[ ( i, j) ] = payload. get ( ) . names . get ( & region. to_unvalidated ( ) ) ;
100+ if let Some ( name) = payload. get ( ) . names . get ( & region. to_unvalidated ( ) ) {
101+ let index = unique_names. binary_search ( & name) . unwrap ( ) ;
102+ dense_matrix[ ( i, j) ] = Some ( index) ;
103+ }
88104 }
89105 }
90106
@@ -137,11 +153,66 @@ fn dnametest() {
137153 values. iter ( ) . filter ( |v| v. is_some ( ) ) . count ( )
138154 } ) ;
139155
140- for ( i, locale) in locales. keys ( ) . enumerate ( ) {
141- println ! ( "{locale:<3}: {}" , large_small[ i] ) ;
142- }
143- for ( i, locale) in script_locales. keys ( ) . enumerate ( ) {
144- let i = i + locales. len ( ) ;
156+ for ( i, locale) in locales. keys ( ) . chain ( script_locales. keys ( ) ) . enumerate ( ) {
145157 println ! ( "{locale:<3}: {}" , large_small[ i] ) ;
146158 }
159+
160+ let locales_only_zerotrie: ZeroTrieSimpleAscii < Vec < u8 > > = locales
161+ . keys ( )
162+ . chain ( script_locales. keys ( ) )
163+ . enumerate ( )
164+ . map ( |( i, locale) | ( locale. to_string ( ) , i) )
165+ . collect ( ) ;
166+ println ! ( "locales_only_zerotrie: {}" , locales_only_zerotrie. byte_len( ) ) ;
167+
168+ let regions_only_zerotrie: ZeroTrieSimpleAscii < Vec < u8 > > = regions. iter ( ) . enumerate ( )
169+ . map ( |( i, locale) | ( locale. to_string ( ) , i) )
170+ . collect ( ) ;
171+
172+ println ! ( "regions_only_zerotrie: {}" , regions_only_zerotrie. byte_len( ) ) ;
173+
174+ let sparse_map: LiteMap < String , usize > = locales
175+ . keys ( )
176+ . chain ( script_locales. keys ( ) )
177+ . enumerate ( )
178+ . flat_map ( |( i, locale) | {
179+ let dense_matrix = & dense_matrix;
180+ regions. iter ( ) . enumerate ( ) . filter_map ( move |( j, region) | {
181+ dense_matrix[ ( i, j) ] . map ( |index| ( format ! ( "{locale}/{region}" ) , index) )
182+ } )
183+ } )
184+ . collect ( ) ;
185+ println ! ( "sparse_map: {}" , sparse_map. len( ) ) ;
186+
187+ let sparse_zerotrie: ZeroTrieSimpleAscii < Vec < u8 > > =
188+ sparse_map. iter ( ) . map ( |( k, v) | ( k, * v) ) . collect ( ) ;
189+ println ! ( "sparse_zerotrie: {}" , sparse_zerotrie. byte_len( ) ) ;
190+
191+ let dense_row_bit_size = regions. len ( ) * unique_names_required_bits;
192+
193+ let mut num_dense_locales = 0 ;
194+ let hybrid_sparse_map: LiteMap < String , usize > = locales
195+ . keys ( )
196+ . chain ( script_locales. keys ( ) )
197+ . enumerate ( )
198+ . flat_map ( |( i, locale) | {
199+ let dense_matrix = & dense_matrix;
200+ let row: Vec < ( String , usize ) > = regions. iter ( ) . enumerate ( ) . filter_map ( move |( j, region) | {
201+ dense_matrix[ ( i, j) ] . map ( |index| ( format ! ( "{locale}/{region}" ) , index) )
202+ } ) . collect ( ) ;
203+ let inner_zerotrie: ZeroTrieSimpleAscii < _ > = row. iter ( ) . map ( |( k, v) | ( k, * v) ) . collect ( ) ;
204+ if inner_zerotrie. byte_len ( ) * 8 > dense_row_bit_size {
205+ num_dense_locales += 1 ;
206+ vec ! [ ( locale. to_string( ) , 0 ) ] . into_iter ( )
207+ } else {
208+ row. into_iter ( )
209+ }
210+ } )
211+ . collect ( ) ;
212+ println ! ( "hybrid_sparse_map: {}" , hybrid_sparse_map. len( ) ) ;
213+ println ! ( "num_dense_locales: {} ({} B)" , num_dense_locales, num_dense_locales * dense_row_bit_size / 8 ) ;
214+
215+ let hybrid_sparse_zerotrie: ZeroTrieSimpleAscii < Vec < u8 > > =
216+ hybrid_sparse_map. iter ( ) . map ( |( k, v) | ( k, * v) ) . collect ( ) ;
217+ println ! ( "hybrid_sparse_zerotrie: {}" , hybrid_sparse_zerotrie. byte_len( ) ) ;
147218}
0 commit comments