@@ -8,6 +8,14 @@ use crate::run_subset::build_single_subset;
8
8
9
9
use super :: PreSubsetContext ;
10
10
11
/// Optimisation tier used when splitting a character set into packages.
///
/// `plugin_auto_subset` picks a tier by matching a `size` value against fixed
/// ranges (below 1000, up to 10000, up to 30000, everything else) and passes it
/// on to `split_vector` / `length_for_index`, where the tier selects how fast
/// the package size grows with the package index.
///
/// The variant names are kept in upper case for compatibility with existing
/// callers; the explicit discriminants allow `level as u32` style conversions.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum OptLevel {
    /// Lowest tier: package size grows linearly with the package index.
    NO = 0,
    /// Package size grows with the square root of the package index.
    LOW = 1,
    /// Package size grows with the cube root of the package index.
    MID = 2,
    /// Highest tier; currently uses the same cube-root curve as `MID`.
    HIGH = 3,
}
18
+
11
19
pub fn plugin_auto_subset (
12
20
subsets : & mut Vec < IndexSet < u32 > > ,
13
21
_remaining_chars_set : & mut HashSet < u32 > ,
@@ -28,12 +36,38 @@ pub fn plugin_auto_subset(
28
36
) ;
29
37
let mut count: usize = 0 ;
30
38
let mut new_used_languages = HashMap :: new ( ) ;
39
+ // 后期用于区分算法,现在暂时无用
40
+ let opt_level = match size {
41
+ 0 ..1000 => OptLevel :: NO ,
42
+ 1000 ..=10000 => OptLevel :: LOW ,
43
+ 10001 ..=30000 => OptLevel :: MID ,
44
+ _ => OptLevel :: HIGH ,
45
+ } ;
31
46
let new_subsets = subsets
32
47
. iter ( )
33
48
. enumerate ( )
34
49
. flat_map ( |( index, subset) | {
35
- let res = split_vector ( subset, chars_per_subset) ;
36
- if let Some ( language) = ctx. used_languages . get ( & index) {
50
+ let lang = ctx. used_languages . get ( & index) ;
51
+ let res = match lang {
52
+ // 繁体中文一般比简体中文要大一倍复杂度,故进行特殊处理
53
+ Some ( ref i) if * i == "ZH_TC" => {
54
+ // 特殊处理ZH_TC的情况
55
+ split_vector (
56
+ subset,
57
+ ( ( chars_per_subset as f32 ) * 0.5_f32 ) as u32 ,
58
+ opt_level,
59
+ ) // 假设对ZH_TC有不一样的处理逻辑
60
+ }
61
+ None => {
62
+ return split_vector (
63
+ subset,
64
+ ( ( chars_per_subset as f32 ) * 0.7_f32 ) as u32 ,
65
+ opt_level,
66
+ ) ;
67
+ }
68
+ _ => split_vector ( subset, chars_per_subset, opt_level) ,
69
+ } ;
70
+ if let Some ( language) = lang {
37
71
for _ in 0 ..res. len ( ) {
38
72
new_used_languages. insert ( count, language. clone ( ) ) ;
39
73
count += 1 ;
@@ -53,13 +87,23 @@ pub fn plugin_auto_subset(
53
87
}
54
88
55
89
// 计算当前包需要容纳多少个字符 y= max_count/ x^(1/3)
56
- fn length_for_index ( x : usize , max_count : u32 ) -> usize {
57
- let y: f32 = ( max_count as f32 ) / ( x as f32 ) . sqrt ( ) ; // 计算立方根并求解y
90
+ fn length_for_index ( x : usize , max_count : u32 , level : OptLevel ) -> usize {
91
+ let min_count = ( max_count / 5 ) as u32 ;
92
+ let y: f32 = match level {
93
+ OptLevel :: NO => ( min_count as f32 ) * ( x as f32 ) ,
94
+ OptLevel :: LOW => ( min_count as f32 ) * ( x as f32 ) . sqrt ( ) ,
95
+ OptLevel :: MID => ( min_count as f32 ) * ( x as f32 ) . cbrt ( ) ,
96
+ OptLevel :: HIGH => ( min_count as f32 ) * ( x as f32 ) . cbrt ( ) ,
97
+ } ; // 计算立方根并求解y
58
98
let y_ceil = y. ceil ( ) ; // 将结果向上取整
59
99
// 不能比 max_count 的 1/5 小
60
- std:: cmp:: max ( y_ceil as usize , ( max_count / 5 ) as usize )
100
+ std:: cmp:: min ( y_ceil as usize , ( max_count) as usize )
61
101
}
62
- fn split_vector ( vec : & IndexSet < u32 > , max_count : u32 ) -> Vec < IndexSet < u32 > > {
102
+ fn split_vector (
103
+ vec : & IndexSet < u32 > ,
104
+ max_count : u32 ,
105
+ level : OptLevel ,
106
+ ) -> Vec < IndexSet < u32 > > {
63
107
let mut result: Vec < IndexSet < u32 > > = Vec :: new ( ) ;
64
108
let mut current_start = 0 ;
65
109
let size = vec. len ( ) ;
@@ -69,7 +113,7 @@ fn split_vector(vec: &IndexSet<u32>, max_count: u32) -> Vec<IndexSet<u32>> {
69
113
debug ! ( "fold {} -> {} | max {}" , size, i - 1 , max_count) ;
70
114
break ;
71
115
}
72
- let len = length_for_index ( i, max_count) ;
116
+ let len = length_for_index ( i, max_count, level . clone ( ) ) ;
73
117
// println!("{}", len);
74
118
let to_take = std:: cmp:: min ( len, size - current_start) ;
75
119
let new_sub_vec = IndexSet :: from_iter (
@@ -88,7 +132,7 @@ mod tests {
88
132
#[ test]
89
133
fn split_vector_empty_input_empty_result ( ) {
90
134
let input = IndexSet :: new ( ) ;
91
- let result = split_vector ( & input, 150 ) ;
135
+ let result = split_vector ( & input, 150 , OptLevel :: LOW ) ;
92
136
assert ! ( result. is_empty( ) ) ;
93
137
}
94
138
@@ -98,7 +142,7 @@ mod tests {
98
142
for x in 1 ..10 {
99
143
input. insert ( x) ;
100
144
}
101
- let result = split_vector ( & input, 150 ) ;
145
+ let result = split_vector ( & input, 150 , OptLevel :: LOW ) ;
102
146
assert_eq ! ( result. len( ) , 1 ) ;
103
147
assert_eq ! ( result[ 0 ] , input) ;
104
148
// println!("result: {:?}", result)
@@ -110,7 +154,7 @@ mod tests {
110
154
for x in 1 ..400 {
111
155
input. insert ( x) ;
112
156
}
113
- let result = split_vector ( & input, 150 ) ;
157
+ let result = split_vector ( & input, 150 , OptLevel :: LOW ) ;
114
158
println ! ( "result: {:#?}" , result) ;
115
159
assert_eq ! ( result. len( ) , 4 ) ;
116
160
}
0 commit comments