1
+ use std:: collections:: { HashMap , HashSet } ;
2
+
1
3
use indexmap:: IndexSet ;
2
4
3
5
use log:: { debug, info} ;
@@ -8,7 +10,7 @@ use super::PreSubsetContext;
8
10
9
11
pub fn plugin_auto_subset (
10
12
subsets : & mut Vec < IndexSet < u32 > > ,
11
- _remaining_chars_set : & mut IndexSet < u32 > ,
13
+ _remaining_chars_set : & mut HashSet < u32 > ,
12
14
ctx : & mut PreSubsetContext ,
13
15
) {
14
16
let size = ctx. all_unicodes . len ( ) ;
@@ -24,22 +26,30 @@ pub fn plugin_auto_subset(
24
26
"predict subset: {}/subset, {} bytes/char, {}(chunk_size)" ,
25
27
bytes_per_char, chars_per_subset, ctx. predict_bytes_pre_subset
26
28
) ;
27
- let new_subsets = chunk_iterable_and_flat ( subsets, chars_per_subset) ;
29
+ let mut count: usize = 0 ;
30
+ let mut new_used_languages = HashMap :: new ( ) ;
31
+ let new_subsets = subsets
32
+ . iter ( )
33
+ . enumerate ( )
34
+ . flat_map ( |( index, subset) | {
35
+ let res = split_vector ( subset, chars_per_subset) ;
36
+ if let Some ( language) = ctx. used_languages . get ( & index) {
37
+ for _ in 0 ..res. len ( ) {
38
+ new_used_languages. insert ( count, language. clone ( ) ) ;
39
+ count += 1 ;
40
+ }
41
+ }
42
+ res
43
+ } )
44
+ . collect :: < Vec < IndexSet < u32 > > > ( ) ;
28
45
subsets. clear ( ) ;
29
46
for i in new_subsets {
30
47
subsets. push ( i) ;
31
48
}
32
- }
33
-
34
- /// 将集合中的每个子集进一步分割成大小不超过 `max_chunk_size` 的更小子集。
35
- pub fn chunk_iterable_and_flat (
36
- subsets : & mut Vec < IndexSet < u32 > > ,
37
- max_chunk_size : u32 ,
38
- ) -> Vec < IndexSet < u32 > > {
39
- subsets
40
- . iter ( )
41
- . flat_map ( |subset| split_vector ( subset, max_chunk_size) )
42
- . collect :: < Vec < IndexSet < u32 > > > ( )
49
+ // new_used_languages.iter().for_each(|(index, name)| {
50
+ // info!("subset: {} {} {}", index, name, subsets[*index].len());
51
+ // });
52
+ ctx. used_languages = new_used_languages;
43
53
}
44
54
45
55
// 计算当前包需要容纳多少个字符 y= max_count/ x^(1/3)
@@ -104,28 +114,9 @@ mod tests {
104
114
assert_eq ! ( result. len( ) , 4 ) ;
105
115
}
106
116
}
107
- #[ test]
108
- fn for_chunk_iterable_and_flat ( ) {
109
- let mut subsets = vec ! [
110
- IndexSet :: from( [ 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 ] ) ,
111
- IndexSet :: from( [ 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 ] ) ,
112
- ] ;
113
- let result = chunk_iterable_and_flat ( & mut subsets, 5 ) ;
114
- assert_eq ! (
115
- result,
116
- vec![
117
- IndexSet :: from( [ 1 , 2 , 3 , 4 ] ) ,
118
- IndexSet :: from( [ 5 , 6 , 7 , 8 ] ) ,
119
- IndexSet :: from( [ 9 , 10 ] ) ,
120
- IndexSet :: from( [ 11 , 12 , 13 , 14 ] ) ,
121
- IndexSet :: from( [ 15 , 16 , 17 , 18 ] ) ,
122
- IndexSet :: from( [ 19 , 20 , 21 ] ) ,
123
- ]
124
- ) ;
125
- }
126
117
127
118
/// 每隔 n 个元素抽取一个元素
128
- fn extract_every_nth < T : Clone > ( set : & IndexSet < T > , n : usize ) -> Vec < T > {
119
+ fn extract_every_nth < T : Clone > ( set : & HashSet < T > , n : usize ) -> Vec < T > {
129
120
// 检查 n 是否有效
130
121
let n = if n == 0 { 1_usize } else { n } ;
131
122
@@ -142,7 +133,7 @@ fn extract_every_nth<T: Clone>(set: &IndexSet<T>, n: usize) -> Vec<T> {
142
133
}
143
134
#[ test]
144
135
fn main ( ) {
145
- let mut set = IndexSet :: new ( ) ;
136
+ let mut set = HashSet :: new ( ) ;
146
137
set. insert ( 1 ) ;
147
138
set. insert ( 2 ) ;
148
139
set. insert ( 3 ) ;
0 commit comments