Skip to content

Commit 26300db

Browse files
committed
feat: 采用更加激进的分包策略
1 parent 863bf95 commit 26300db

File tree

4 files changed

+37
-10
lines changed

4 files changed

+37
-10
lines changed

crates/lang_unicodes/src/lib.rs

+13-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use cjk_unicodes::{
2-
HANGUL_JAMO, HANGUL_SYL, HIRAGANA_AND_KATAKANA, ZH_SC, ZH_SYMBOL, ZH_TC,
2+
HANGUL_JAMO, HANGUL_SYL, HIRAGANA_AND_KATAKANA, ZH_SC, ZH_TC,
33
};
44
use lazy_static::lazy_static;
55
pub mod cjk_unicodes;
@@ -20,6 +20,12 @@ lazy_static! {
2020
pub static ref LATIN_EXT_B: Vec<u32> = expand_ranges(&[(0x0180, 0x024F)]);
2121

2222

23+
pub static ref HALFWIDTH_FULLWIDTH: Vec<u32> = expand_ranges(&[(0xFF00, 0xFFEF)]);
24+
25+
pub static ref IPA_SYMBOLS: Vec<u32> = expand_ranges(&[(0x0250, 0x02FF)]);
26+
27+
pub static ref ZH_SYMBOL: Vec<u32> = expand_ranges(&[(0x0300,0x036f),(0xFE10, 0xFE4F)]);
28+
2329

2430
pub static ref GREEK: Vec<u32> = expand_ranges(&[(0x0370, 0x03FF), (0x1F00, 0x1FFF)]);
2531

@@ -128,11 +134,13 @@ mod tests {
128134
}
129135
}
130136

131-
pub fn create_default_unicode_area() -> [Vec<u32>; 29] {
137+
pub fn create_default_unicode_area() -> [Vec<u32>; 31] {
132138
[
133139
LATIN.to_vec(),
134140
LATIN_EXT_A.to_vec(),
135141
LATIN_EXT_B.to_vec(),
142+
IPA_SYMBOLS.to_vec(),
143+
HALFWIDTH_FULLWIDTH.to_vec(),
136144
GREEK.to_vec(),
137145
CYRILLIC.to_vec(),
138146
// 中文处理
@@ -164,11 +172,13 @@ pub fn create_default_unicode_area() -> [Vec<u32>; 29] {
164172
NAXI_DONGBA.to_vec(),
165173
]
166174
}
167-
pub fn create_default_unicode_area_tag() -> [&'static str; 29] {
175+
pub fn create_default_unicode_area_tag() -> [&'static str; 31] {
168176
[
169177
"LATIN",
170178
"LATIN_EXT_A",
171179
"LATIN_EXT_B",
180+
"IPA_SYMBOLS",
181+
"HALFWIDTH_FULLWIDTH",
172182
"GREEK",
173183
"CYRILLIC",
174184
// 中文处理

src/pre_subset/mod.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -72,12 +72,12 @@ pub fn pre_subset(ctx: &mut Context) {
7272
if ctx.input.language_areas.unwrap_or(true) {
7373
process.push(language_area_plugin);
7474
}
75-
if ctx.input.subset_remain_chars.unwrap_or(true) {
76-
process.push(add_remain_chars_plugin);
77-
}
7875
if ctx.input.auto_subset.unwrap_or(true) {
7976
process.push(plugin_auto_subset);
8077
}
78+
if ctx.input.subset_remain_chars.unwrap_or(true) {
79+
process.push(add_remain_chars_plugin);
80+
}
8181
if ctx.input.font_feature.unwrap_or(true) {
8282
process.push(features_plugin);
8383
}

src/pre_subset/plugin.rs

+17-1
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,26 @@ pub fn add_remain_chars_plugin(
4343
_ctx: &mut PreSubsetContext,
4444
) {
4545
info!("{} 个剩余字符被处理", remaining_chars_set.len());
46-
subsets.push(remaining_chars_set.iter().cloned().collect());
46+
let mut v: Vec<u32> = remaining_chars_set.iter().cloned().collect();
47+
v.sort();
48+
49+
for i in split_into_chunks(v, 70) {
50+
subsets.push(i.iter().cloned().collect());
51+
}
52+
4753
remaining_chars_set.clear();
4854
}
4955

56+
/// Splits `v` into consecutive chunks of at most `chunk_size` elements,
/// preserving the input order.
///
/// Every chunk except possibly the last holds exactly `chunk_size` items;
/// the last one holds whatever remains. An empty `v` yields an empty result.
///
/// # Panics
///
/// Panics if `chunk_size` is 0, because `slice::chunks` rejects a zero size.
fn split_into_chunks(v: Vec<u32>, chunk_size: usize) -> Vec<Vec<u32>> {
    // `to_vec` sizes each chunk exactly, so the trailing short chunk does not
    // reserve a full `chunk_size` of capacity.
    v.chunks(chunk_size).map(<[u32]>::to_vec).collect()
}
5066
/// 把数量低于某个值的包,重新规划,缩减碎片分包数
5167
pub fn reduce_min_plugin(
5268
subsets: &mut Vec<IndexSet<u32>>,

src/pre_subset/plugin_auto_subset.rs

+4-3
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ pub fn plugin_auto_subset(
2424

2525
info!(
2626
"predict subset: {}/subset, {} bytes/char, {}(chunk_size)",
27-
bytes_per_char, chars_per_subset, ctx.predict_bytes_pre_subset
27+
chars_per_subset, bytes_per_char, ctx.predict_bytes_pre_subset
2828
);
2929
let mut count: usize = 0;
3030
let mut new_used_languages = HashMap::new();
@@ -54,9 +54,10 @@ pub fn plugin_auto_subset(
5454

5555
/// Computes how many characters the package at position `x` should hold:
/// `y = max(ceil(max_count / sqrt(x)), max_count / 5)`.
///
/// The size shrinks as `x` grows, but never drops below one fifth of
/// `max_count` (integer division).
///
/// NOTE(review): for `x == 0` the division yields infinity (or NaN when
/// `max_count` is also 0), which the float-to-integer cast saturates to
/// `usize::MAX` (or 0). Presumably callers start counting at 1; confirm.
fn length_for_index(x: usize, max_count: u32) -> usize {
    // Divide by the square root of the index and round the result up.
    let scaled = ((max_count as f32) / (x as f32).sqrt()).ceil();
    // Lower bound: must not be smaller than 1/5 of max_count.
    let floor = (max_count / 5) as usize;
    (scaled as usize).max(floor)
}
6162
fn split_vector(vec: &IndexSet<u32>, max_count: u32) -> Vec<IndexSet<u32>> {
6263
let mut result: Vec<IndexSet<u32>> = Vec::new();

0 commit comments

Comments
 (0)