
Commit 16cfd40

performance: fix the performance issue caused by IndexSet; add language tags
1 parent 4c3f19a commit 16cfd40

File tree

7 files changed: +111 -69 lines changed

crates/lang_unicodes/src/lib.rs (+36)

@@ -164,3 +164,39 @@ pub fn create_default_unicode_area() -> [Vec<u32>; 29] {
     NAXI_DONGBA.to_vec(),
   ]
 }
+pub fn create_default_unicode_area_tag() -> [&'static str; 29] {
+  [
+    "LATIN",
+    "LATIN_EXT_A",
+    "LATIN_EXT_B",
+    "GREEK",
+    "CYRILLIC",
+    // Chinese handling
+    "ZH_SYMBOL",
+    "ZH_SC",
+    "ZH_TC",
+    // Japanese handling
+    "HIRAGANA_AND_KATAKANA",
+    // Korean handling
+    "HANGUL_JAMO",
+    "HANGUL_SYL",
+    "BENGALI",
+    "ARABIC",
+    "DEVANAGARI",
+    "THAI",
+    "KHMER",
+    "TIBETAN",
+    "MONGOLIAN",
+    "TAI_LUE",
+    "YI",
+    "PHAGS_PA",
+    "LISU",
+    "BUHID",
+    "MIAO",
+    "HANI",
+    "LAHU",
+    "VA",
+    "ZHUANG",
+    "NAXI_DONGBA",
+  ]
+}
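
The new tag array is index-aligned with create_default_unicode_area(), so callers can zip the two to attach a human-readable name to every unicode area; this is how the language-area plugin further down consumes it. A minimal usage sketch, not part of the commit:

use lang_unicodes::{create_default_unicode_area, create_default_unicode_area_tag};

fn print_area_sizes() {
  let areas = create_default_unicode_area();
  let tags = create_default_unicode_area_tag();
  // The two arrays have the same length (29), so zip pairs each area with its tag.
  for (area, tag) in areas.iter().zip(tags.iter()) {
    println!("{tag}: {} code points", area.len());
  }
}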

src/lib.rs (+4 -3)

@@ -40,12 +40,13 @@ fn main_test() {
       // file_name: Some("input.css".to_string()),
       ..Default::default()
     }),
-    // chunk_size: Some(40 * 1024),
+    chunk_size: Some(50 * 1024),
     // fine-grained control
     // subsets: vec![[65]].iter().map(|x| u32_array_to_u8_array(x)).collect(),
     // language_areas: Some(false),
     // auto_subset: Some(false),
-    // font_feature: Some(false),
+    // subset_remain_chars: Some(false),
+    font_feature: Some(false),
     // reduce_mins: Some(false),
     // rename_output_font: Some("font_[hash:6].[ext]".to_string()),
     ..Default::default()
@@ -72,5 +73,5 @@ fn main_test() {
   }

   test_on("./packages/demo/public/SmileySans-Oblique.ttf", "ttf");
-  test_on("./packages/demo/public/SmileySans-Oblique.ttf.woff2", "woff2");
+  // test_on("./packages/demo/public/SmileySans-Oblique.ttf.woff2", "woff2");
 }

src/pre_subset/features.rs (+2 -2)

@@ -2,15 +2,15 @@ use cmap::analyze_cmap;
 use gpos::analyze_gpos;
 use gsub::analyze_gsub;
 use indexmap::IndexSet;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};

 use super::PreSubsetContext;
 pub mod cmap;
 pub mod gpos;
 pub mod gsub;
 pub fn features_plugin(
   subsets: &mut Vec<IndexSet<u32>>,
-  _remaining_chars_set: &mut IndexSet<u32>,
+  _remaining_chars_set: &mut HashSet<u32>,
   ctx: &mut PreSubsetContext,
 ) {
   let cmap = analyze_cmap(ctx.font, ctx.font_file);

src/pre_subset/mod.rs (+10 -7)

@@ -16,25 +16,29 @@ use plugin::{
   add_remain_chars_plugin, language_area_plugin, reduce_min_plugin,
 };
 use plugin_auto_subset::plugin_auto_subset;
-use std::io::Cursor;
+use std::{
+  collections::{HashMap, HashSet},
+  io::Cursor,
+};

 pub struct PreSubsetContext<'a, 'b, 'c>
 where
   'b: 'a,
   'c: 'a,
 {
-  all_unicodes: IndexSet<u32>,
+  all_unicodes: HashSet<u32>,
   face: &'a mut Owned<Face<'b>>,
   predict_bytes_pre_subset: u32,
   font: &'a opentype::Font,
   font_file: &'a mut Cursor<&'c [u8]>,
   subsets: &'c Vec<Vec<u32>>,
+  used_languages: HashMap<usize, String>,
 }

 pub fn pre_subset(ctx: &mut Context) {
   let file_binary = &*ctx.binary;
-  let mut all_unicodes: IndexSet<u32> =
-    IndexSet::from_iter(ctx.face.collect_unicodes());
+  let mut all_unicodes: HashSet<u32> =
+    HashSet::from_iter(ctx.face.collect_unicodes());

   let mut font_file = Cursor::new(file_binary);
   let font = opentype::Font::read(&mut font_file)
@@ -54,12 +58,13 @@ pub fn pre_subset(ctx: &mut Context) {
     font: &font,
     subsets: &user_subsets,
     font_file: &mut font_file,
+    used_languages: HashMap::new(),
   };

   let mut process: Vec<
     fn(
       &mut Vec<IndexSet<u32>>,
-      &mut IndexSet<u32>,
+      &mut HashSet<u32>,
       &mut PreSubsetContext<'_, '_, '_>,
     ),
   > = vec![];
@@ -82,8 +87,6 @@ pub fn pre_subset(ctx: &mut Context) {
   for p in process {
     p(&mut subsets, &mut all_unicodes, &mut context);
   }
-
-  // let set = analyze_gsub(&font, &mut font_file);
   ctx.pre_subset_result = subsets
     .iter()
     .filter(|v| v.len() > 0)
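
With this change the pre-subset context carries a used_languages map, and the shared character pools (all_unicodes, the remaining-chars set) become plain HashSets, since those collections only need membership tests rather than insertion order. Every plugin keeps one common signature, so the pipeline stays a Vec of function pointers run in order. A simplified sketch of that shape, with a hypothetical Ctx type standing in for PreSubsetContext:

use indexmap::IndexSet;
use std::collections::HashSet;

struct Ctx; // stand-in for PreSubsetContext

fn plugin_a(_s: &mut Vec<IndexSet<u32>>, _r: &mut HashSet<u32>, _c: &mut Ctx) {}
fn plugin_b(_s: &mut Vec<IndexSet<u32>>, _r: &mut HashSet<u32>, _c: &mut Ctx) {}

fn run_pipeline() {
  let mut subsets: Vec<IndexSet<u32>> = vec![];
  let mut remaining: HashSet<u32> = HashSet::new();
  let mut ctx = Ctx;
  // All plugins share one signature, so they coerce to a single fn-pointer type.
  let process: Vec<fn(&mut Vec<IndexSet<u32>>, &mut HashSet<u32>, &mut Ctx)> =
    vec![plugin_a, plugin_b];
  for p in process {
    p(&mut subsets, &mut remaining, &mut ctx);
  }
}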

src/pre_subset/plugin.rs (+30 -21)

@@ -1,47 +1,56 @@
+use std::collections::HashSet;
+
 use indexmap::IndexSet;

-use lang_unicodes::create_default_unicode_area;
+use lang_unicodes::{
+  create_default_unicode_area, create_default_unicode_area_tag,
+};
 use log::info;

 use super::PreSubsetContext;

 pub fn language_area_plugin(
   subsets: &mut Vec<IndexSet<u32>>,
-  remaining_chars_set: &mut IndexSet<u32>,
-  _ctx: &mut PreSubsetContext,
+  remaining_chars_set: &mut HashSet<u32>,
+  ctx: &mut PreSubsetContext,
 ) {
   let language_area = create_default_unicode_area();
-  language_area.iter().for_each(|area| {
-    let set = IndexSet::from_iter(
-      area.iter()
-        .filter(|c| {
-          let is_in_remain = remaining_chars_set.contains(*c);
-          // ! side effect: drop this character from the remaining set
-          remaining_chars_set.shift_remove(*c);
-          is_in_remain
-        })
-        .map(|c| c.clone()),
-    );
-    if set.len() > 0 {
-      subsets.push(set);
-    }
-  });
+  let language_area_tag = create_default_unicode_area_tag();
+  language_area.iter().zip(language_area_tag.iter()).enumerate().for_each(
+    |(index, (area, tag))| {
+      let set = IndexSet::from_iter(
+        area.iter()
+          .filter(|c| {
+            let is_in_remain = remaining_chars_set.contains(*c);
+            // ! side effect: drop this character from the remaining set
+            remaining_chars_set.remove(*c);
+            is_in_remain
+          })
+          .map(|c| c.clone()),
+      );
+      if set.len() > 0 {
+        ctx.used_languages.insert(index, tag.to_string());
+        // println!("{tag} {}", set.len());
+        subsets.push(set);
+      }
+    },
+  );
 }

 pub fn add_remain_chars_plugin(
   subsets: &mut Vec<IndexSet<u32>>,
-  remaining_chars_set: &mut IndexSet<u32>,
+  remaining_chars_set: &mut HashSet<u32>,
   _ctx: &mut PreSubsetContext,
 ) {
   info!("{} 个剩余字符被处理", remaining_chars_set.len());
-  subsets.push(remaining_chars_set.clone());
+  subsets.push(remaining_chars_set.iter().cloned().collect());
   remaining_chars_set.clear();
 }

 /// Repack subsets whose size falls below a threshold to reduce the number of fragmented chunks
 pub fn reduce_min_plugin(
   subsets: &mut Vec<IndexSet<u32>>,
-  _remaining_chars_set: &mut IndexSet<u32>,
+  _remaining_chars_set: &mut HashSet<u32>,
   _ctx: &mut PreSubsetContext,
 ) {
   // TODO: extract into a named definition
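
The performance fix in this plugin is the switch from IndexSet::shift_remove to HashSet::remove on the remaining-chars set. shift_remove preserves insertion order by shifting every later entry, so it costs O(n) per call, and stripping most of a large character set one code point at a time degrades quadratically; HashSet::remove is O(1) on average. A rough illustration of the difference (a hypothetical micro-benchmark, not code from the commit):

use indexmap::IndexSet;
use std::collections::HashSet;
use std::time::Instant;

fn main() {
  let n: u32 = 30_000;
  let mut index_set: IndexSet<u32> = (0..n).collect();
  let mut hash_set: HashSet<u32> = (0..n).collect();

  let t = Instant::now();
  for c in 0..n {
    index_set.shift_remove(&c); // shifts all remaining entries on every call
  }
  println!("IndexSet::shift_remove: {:?}", t.elapsed());

  let t = Instant::now();
  for c in 0..n {
    hash_set.remove(&c); // average O(1)
  }
  println!("HashSet::remove: {:?}", t.elapsed());
}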

src/pre_subset/plugin_add_user_subset.rs (+4 -2)

@@ -1,17 +1,19 @@
+use std::collections::HashSet;
+
 use super::PreSubsetContext;
 use indexmap::IndexSet;

 // Add the user's subsets first
 pub fn plugin_add_user_subset(
   subsets: &mut Vec<IndexSet<u32>>,
-  _remaining_chars_set: &mut IndexSet<u32>,
+  _remaining_chars_set: &mut HashSet<u32>,
   ctx: &mut PreSubsetContext,
 ) {
   ctx.subsets.iter().for_each(|u32_arr: &Vec<u32>| {
     let mut subset: IndexSet<u32> = IndexSet::new();
     u32_arr.iter().for_each(|x| {
       subset.insert(x.clone());
-      _remaining_chars_set.shift_remove(x);
+      _remaining_chars_set.remove(x);
     });
     subsets.push(subset);
   });

src/pre_subset/plugin_auto_subset.rs (+25 -34)

@@ -1,3 +1,5 @@
+use std::collections::{HashMap, HashSet};
+
 use indexmap::IndexSet;

 use log::{debug, info};
@@ -8,7 +10,7 @@ use super::PreSubsetContext;

 pub fn plugin_auto_subset(
   subsets: &mut Vec<IndexSet<u32>>,
-  _remaining_chars_set: &mut IndexSet<u32>,
+  _remaining_chars_set: &mut HashSet<u32>,
   ctx: &mut PreSubsetContext,
 ) {
   let size = ctx.all_unicodes.len();
@@ -24,22 +26,30 @@ pub fn plugin_auto_subset(
     "predict subset: {}/subset, {} bytes/char, {}(chunk_size)",
     bytes_per_char, chars_per_subset, ctx.predict_bytes_pre_subset
   );
-  let new_subsets = chunk_iterable_and_flat(subsets, chars_per_subset);
+  let mut count: usize = 0;
+  let mut new_used_languages = HashMap::new();
+  let new_subsets = subsets
+    .iter()
+    .enumerate()
+    .flat_map(|(index, subset)| {
+      let res = split_vector(subset, chars_per_subset);
+      if let Some(language) = ctx.used_languages.get(&index) {
+        for _ in 0..res.len() {
+          new_used_languages.insert(count, language.clone());
+          count += 1;
+        }
+      }
+      res
+    })
+    .collect::<Vec<IndexSet<u32>>>();
   subsets.clear();
   for i in new_subsets {
     subsets.push(i);
   }
-}
-
-/// Split every subset in the collection into smaller subsets of at most `max_chunk_size` elements.
-pub fn chunk_iterable_and_flat(
-  subsets: &mut Vec<IndexSet<u32>>,
-  max_chunk_size: u32,
-) -> Vec<IndexSet<u32>> {
-  subsets
-    .iter()
-    .flat_map(|subset| split_vector(subset, max_chunk_size))
-    .collect::<Vec<IndexSet<u32>>>()
+  // new_used_languages.iter().for_each(|(index, name)| {
+  //   info!("subset: {} {} {}", index, name, subsets[*index].len());
+  // });
+  ctx.used_languages = new_used_languages;
 }

 // Compute how many characters the current chunk should hold: y = max_count / x^(1/3)
@@ -104,28 +114,9 @@ mod tests {
     assert_eq!(result.len(), 4);
   }
 }
-#[test]
-fn for_chunk_iterable_and_flat() {
-  let mut subsets = vec![
-    IndexSet::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
-    IndexSet::from([11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
-  ];
-  let result = chunk_iterable_and_flat(&mut subsets, 5);
-  assert_eq!(
-    result,
-    vec![
-      IndexSet::from([1, 2, 3, 4]),
-      IndexSet::from([5, 6, 7, 8]),
-      IndexSet::from([9, 10]),
-      IndexSet::from([11, 12, 13, 14]),
-      IndexSet::from([15, 16, 17, 18]),
-      IndexSet::from([19, 20, 21]),
-    ]
-  );
-}

 /// Take one element every n elements
-fn extract_every_nth<T: Clone>(set: &IndexSet<T>, n: usize) -> Vec<T> {
+fn extract_every_nth<T: Clone>(set: &HashSet<T>, n: usize) -> Vec<T> {
   // check that n is valid
   let n = if n == 0 { 1_usize } else { n };

@@ -142,7 +133,7 @@ fn extract_every_nth<T: Clone>(set: &IndexSet<T>, n: usize) -> Vec<T> {
 }
 #[test]
 fn main() {
-  let mut set = IndexSet::new();
+  let mut set = HashSet::new();
   set.insert(1);
   set.insert(2);
   set.insert(3);
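
When a tagged subset is re-chunked here, every resulting chunk re-registers its parent's language tag under its new index, so the tag survives the split. An illustrative sketch of that data flow, with a hypothetical split helper standing in for split_vector (a simplification, not the commit's exact bookkeeping):

use indexmap::IndexSet;
use std::collections::HashMap;

// Hypothetical splitter: break one subset into pieces of at most `chunk` elements.
fn split(set: &IndexSet<u32>, chunk: usize) -> Vec<IndexSet<u32>> {
  let chunk = chunk.max(1);
  set
    .iter()
    .copied()
    .collect::<Vec<u32>>()
    .chunks(chunk)
    .map(|piece| piece.iter().copied().collect())
    .collect()
}

fn rechunk_with_tags(
  subsets: &[IndexSet<u32>],
  used_languages: &HashMap<usize, String>,
  chunk: usize,
) -> (Vec<IndexSet<u32>>, HashMap<usize, String>) {
  let mut new_subsets: Vec<IndexSet<u32>> = Vec::new();
  let mut new_used_languages = HashMap::new();
  for (index, subset) in subsets.iter().enumerate() {
    for piece in split(subset, chunk) {
      // Each chunk inherits the language tag of the subset it came from.
      if let Some(tag) = used_languages.get(&index) {
        new_used_languages.insert(new_subsets.len(), tag.clone());
      }
      new_subsets.push(piece);
    }
  }
  (new_subsets, new_used_languages)
}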
