feat: 采用更加激进的分包策略

KonghaYao · KonghaYao · commit 26300db0d3a9 · 2025-03-05T22:10:42.000+08:00
diff --git a/crates/lang_unicodes/src/lib.rs b/crates/lang_unicodes/src/lib.rs
@@ -1,5 +1,5 @@
 use cjk_unicodes::{
-    HANGUL_JAMO, HANGUL_SYL, HIRAGANA_AND_KATAKANA, ZH_SC, ZH_SYMBOL, ZH_TC,
+    HANGUL_JAMO, HANGUL_SYL, HIRAGANA_AND_KATAKANA, ZH_SC, ZH_TC,
 };
 use lazy_static::lazy_static;
 pub mod cjk_unicodes;
@@ -20,6 +20,12 @@ lazy_static! {
     pub static ref LATIN_EXT_B: Vec<u32> = expand_ranges(&[(0x0180, 0x024F)]);
 
 
+    pub static ref HALFWIDTH_FULLWIDTH: Vec<u32> = expand_ranges(&[(0xFF00, 0xFFEF)]);
+
+    pub static ref IPA_SYMBOLS: Vec<u32> = expand_ranges(&[(0x0250, 0x02FF)]);
+
+    pub static ref ZH_SYMBOL: Vec<u32> = expand_ranges(&[(0x0300,0x036f),(0xFE10, 0xFE4F)]);
+
 
     pub static ref GREEK: Vec<u32> = expand_ranges(&[(0x0370, 0x03FF), (0x1F00, 0x1FFF)]);
 
@@ -128,11 +134,13 @@ mod tests {
     }
 }
 
-pub fn create_default_unicode_area() -> [Vec<u32>; 29] {
+pub fn create_default_unicode_area() -> [Vec<u32>; 31] {
     [
         LATIN.to_vec(),
         LATIN_EXT_A.to_vec(),
         LATIN_EXT_B.to_vec(),
+        IPA_SYMBOLS.to_vec(),
+        HALFWIDTH_FULLWIDTH.to_vec(),
         GREEK.to_vec(),
         CYRILLIC.to_vec(),
         // 中文处理
@@ -164,11 +172,13 @@ pub fn create_default_unicode_area() -> [Vec<u32>; 29] {
         NAXI_DONGBA.to_vec(),
     ]
 }
-pub fn create_default_unicode_area_tag() -> [&'static str; 29] {
+pub fn create_default_unicode_area_tag() -> [&'static str; 31] {
     [
         "LATIN",
         "LATIN_EXT_A",
         "LATIN_EXT_B",
+        "IPA_SYMBOLS",
+        "HALFWIDTH_FULLWIDTH",
         "GREEK",
         "CYRILLIC",
         // 中文处理
diff --git a/src/pre_subset/mod.rs b/src/pre_subset/mod.rs
@@ -72,12 +72,12 @@ pub fn pre_subset(ctx: &mut Context) {
     if ctx.input.language_areas.unwrap_or(true) {
         process.push(language_area_plugin);
     }
-    if ctx.input.subset_remain_chars.unwrap_or(true) {
-        process.push(add_remain_chars_plugin);
-    }
     if ctx.input.auto_subset.unwrap_or(true) {
         process.push(plugin_auto_subset);
     }
+    if ctx.input.subset_remain_chars.unwrap_or(true) {
+        process.push(add_remain_chars_plugin);
+    }
     if ctx.input.font_feature.unwrap_or(true) {
         process.push(features_plugin);
     }
diff --git a/src/pre_subset/plugin.rs b/src/pre_subset/plugin.rs
@@ -43,10 +43,26 @@ pub fn add_remain_chars_plugin(
     _ctx: &mut PreSubsetContext,
 ) {
     info!("{} 个剩余字符被处理", remaining_chars_set.len());
-    subsets.push(remaining_chars_set.iter().cloned().collect());
+    let mut v: Vec<u32> = remaining_chars_set.iter().cloned().collect();
+    v.sort();
+
+    for i in split_into_chunks(v, 70) {
+        subsets.push(i.iter().cloned().collect());
+    }
+
     remaining_chars_set.clear();
 }
 
+fn split_into_chunks(v: Vec<u32>, chunk_size: usize) -> Vec<Vec<u32>> {
+    let mut result = Vec::new();
+    let mut current_chunk;
+    for chunk in v.chunks(chunk_size) {
+        current_chunk = Vec::with_capacity(chunk_size);
+        current_chunk.extend_from_slice(chunk);
+        result.push(current_chunk);
+    }
+    result
+}
 /// 把数量低于某个值的包，重新规划，缩减碎片分包数
 pub fn reduce_min_plugin(
     subsets: &mut Vec<IndexSet<u32>>,
diff --git a/src/pre_subset/plugin_auto_subset.rs b/src/pre_subset/plugin_auto_subset.rs
@@ -24,7 +24,7 @@ pub fn plugin_auto_subset(
 
     info!(
         "predict subset: {}/subset, {} bytes/char, {}(chunk_size)",
-        bytes_per_char, chars_per_subset, ctx.predict_bytes_pre_subset
+        chars_per_subset, bytes_per_char, ctx.predict_bytes_pre_subset
     );
     let mut count: usize = 0;
     let mut new_used_languages = HashMap::new();
@@ -54,9 +54,10 @@ pub fn plugin_auto_subset(
 
 // 计算当前包需要容纳多少个字符 y = max(max_count / x^(1/2), max_count / 5)
 fn length_for_index(x: usize, max_count: u32) -> usize {
-    let y: f32 = (max_count as f32) / (x as f32).cbrt(); // 计算立方根并求解y
+    let y: f32 = (max_count as f32) / (x as f32).sqrt(); // 计算平方根并求解y
     let y_ceil = y.ceil(); // 将结果向上取整
-    y_ceil as usize
+                           // 不能比 max_count 的 1/5 小
+    std::cmp::max(y_ceil as usize, (max_count / 5) as usize)
 }
 fn split_vector(vec: &IndexSet<u32>, max_count: u32) -> Vec<IndexSet<u32>> {
     let mut result: Vec<IndexSet<u32>> = Vec::new();

Original file line number	Diff line number	Diff line change
`@@ -72,12 +72,12 @@ pub fn pre_subset(ctx: &mut Context) {`
`72`	`72`	`if ctx.input.language_areas.unwrap_or(true) {`
`73`	`73`	`process.push(language_area_plugin);`
`74`	`74`	`}`
`75`		`- if ctx.input.subset_remain_chars.unwrap_or(true) {`
`76`		`- process.push(add_remain_chars_plugin);`
`77`		`- }`
`78`	`75`	`if ctx.input.auto_subset.unwrap_or(true) {`
`79`	`76`	`process.push(plugin_auto_subset);`
`80`	`77`	`}`
	`78`	`+ if ctx.input.subset_remain_chars.unwrap_or(true) {`
	`79`	`+ process.push(add_remain_chars_plugin);`
	`80`	`+ }`
`81`	`81`	`if ctx.input.font_feature.unwrap_or(true) {`
`82`	`82`	`process.push(features_plugin);`
`83`	`83`	`}`