Skip to content

Commit 2e2eb17

Browse files
committed
Implement some optimizations for init font move processing.
Specifically speed up the case where the font being analyzed has a very large number of segments: - Fix a couple of places that were processing all segments in the font to only process candidates for the init font move. - Add a batch phase to the start of init font processing which identifies and moves multiple inert segments in one batch operation. - Individual processing as usual is performed on anything remaining after the batch processing step.
1 parent 8796a83 commit 2e2eb17

File tree

8 files changed

+182
-66
lines changed

8 files changed

+182
-66
lines changed

ift/encoder/candidate_merge.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ static btree_map<ActivationCondition, GlyphSet> PatchesWithGlyphs(
313313
return result;
314314
}
315315

316-
StatusOr<double> CandidateMerge::ComputeInitFontCostDelta(
316+
StatusOr<std::pair<double, GlyphSet>> CandidateMerge::ComputeInitFontCostDelta(
317317
Merger& merger, uint32_t existing_init_font_size, bool best_case,
318318
const GlyphSet& moved_glyphs) {
319319
VLOG(1) << "cost_delta for move of glyphs " << moved_glyphs.ToString()
@@ -407,7 +407,7 @@ StatusOr<double> CandidateMerge::ComputeInitFontCostDelta(
407407

408408
VLOG(1) << " = " << total_delta;
409409

410-
return total_delta;
410+
return std::make_pair(total_delta, glyph_closure_delta);
411411
}
412412

413413
StatusOr<double> CandidateMerge::ComputeCostDelta(

ift/encoder/candidate_merge.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,13 @@ struct CandidateMerge {
186186

187187
// Computes the predicted change to the toal cost if moved_glyphs are
188188
// moved from patches into the initial font.
189-
static absl::StatusOr<double> ComputeInitFontCostDelta(
190-
Merger& merger, uint32_t existing_init_font_size, bool best_case,
191-
const common::GlyphSet& moved_glyphs);
189+
//
190+
// Returns the cost delta, and the full set of glyphs that will be moved
191+
// (including those added by closure).
192+
static absl::StatusOr<std::pair<double, common::GlyphSet>>
193+
ComputeInitFontCostDelta(Merger& merger, uint32_t existing_init_font_size,
194+
bool best_case,
195+
const common::GlyphSet& moved_glyphs);
192196

193197
static absl::StatusOr<double> ComputePatchMergeCostDelta(
194198
const Merger& context, segment_index_t base_segment,

ift/encoder/closure_glyph_segmenter_test.cc

Lines changed: 50 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -598,15 +598,15 @@ TEST_F(ClosureGlyphSegmenterTest, SimpleSegmentation_PatchMerge) {
598598
{{' ', ' '}, 1000},
599599
{{'a', 'a'}, 1000},
600600
{{'!', '!'}, 999},
601-
{{0x203C, 0x203C}, 1} // !!
601+
{{0x203C, 0x203C}, 1} // !!
602602
};
603603

604-
MergeStrategy strategy = *MergeStrategy::CostBased(std::move(frequencies), 75, 1);
604+
MergeStrategy strategy =
605+
*MergeStrategy::CostBased(std::move(frequencies), 75, 1);
605606
strategy.SetUsePatchMerges(true);
606607

607608
auto segmentation = segmenter.CodepointToGlyphSegments(
608-
roboto.get(), {}, {{'a'}, {'!'}, {0x203C}},
609-
strategy);
609+
roboto.get(), {}, {{'a'}, {'!'}, {0x203C}}, strategy);
610610
ASSERT_TRUE(segmentation.ok()) << segmentation.status();
611611

612612
std::vector<SubsetDefinition> expected_segments = {{'a'}, {'!'}, {0x203C}};
@@ -626,19 +626,21 @@ TEST_F(ClosureGlyphSegmenterTest, SimpleSegmentation_NoPatchMerge) {
626626
{{' ', ' '}, 1000},
627627
{{'a', 'a'}, 1000},
628628
{{'!', '!'}, 999},
629-
{{0x203C, 0x203C}, 1} // !!
629+
{{0x203C, 0x203C}, 1} // !!
630630
};
631631

632-
MergeStrategy strategy = *MergeStrategy::CostBased(std::move(frequencies), 75, 1);
632+
MergeStrategy strategy =
633+
*MergeStrategy::CostBased(std::move(frequencies), 75, 1);
633634
strategy.SetUsePatchMerges(false);
634635

635636
auto segmentation = segmenter.CodepointToGlyphSegments(
636-
roboto.get(), {}, {{'a'}, {'!'}, {0x203C}},
637-
strategy);
637+
roboto.get(), {}, {{'a'}, {'!'}, {0x203C}}, strategy);
638638
ASSERT_TRUE(segmentation.ok()) << segmentation.status();
639639

640-
// Because the condition for ! OR !! has high probability it get's merged with a
641-
std::vector<SubsetDefinition> expected_segments = {{'a', '!', 0x203c}, {}, {}};
640+
// Because the condition for ! OR !! has high probability it get's merged with
641+
// a
642+
std::vector<SubsetDefinition> expected_segments = {
643+
{'a', '!', 0x203c}, {}, {}};
642644
ASSERT_EQ(segmentation->Segments(), expected_segments);
643645

644646
ASSERT_EQ(segmentation->ToString(),
@@ -899,8 +901,6 @@ if (s0) then p0
899901
)");
900902
}
901903

902-
// TODO XXXX InitFontMerging w/ Probability Threshold
903-
904904
TEST_F(ClosureGlyphSegmenterTest, InitFontMerging) {
905905
// In this test we enable merging of segments into the init font
906906
UnicodeFrequencies frequencies{
@@ -938,6 +938,44 @@ if (s2) then p0
938938
)");
939939
}
940940

941+
TEST_F(ClosureGlyphSegmenterTest, InitFontMerging_WithProbabilityThreshold) {
942+
// In this test we enable merging of segments into the init font
943+
UnicodeFrequencies frequencies{
944+
{{'a', 'a'}, 100},
945+
{{'b', 'b'}, 97},
946+
{{'c', 'c'}, 97},
947+
{{'d', 'd'}, 97},
948+
949+
// b and c co-occur
950+
{{'b', 'c'}, 97},
951+
};
952+
953+
MergeStrategy strategy =
954+
*MergeStrategy::BigramCostBased(std::move(frequencies));
955+
strategy.SetInitFontMergeThreshold(-75);
956+
strategy.SetInitFontMergeProbabilityThreshold(0.98);
957+
958+
auto segmentation = segmenter.CodepointToGlyphSegments(
959+
roboto.get(), {}, {{'a'}, {'d'}, {'b'}, {'c'}}, strategy);
960+
ASSERT_TRUE(segmentation.ok()) << segmentation.status();
961+
962+
// 'a' will be moved to the init font, leaving the other segments
963+
// to be merged together.
964+
std::vector<SubsetDefinition> expected_segments = {
965+
{},
966+
{'b', 'c', 'd'},
967+
{},
968+
{},
969+
};
970+
ASSERT_EQ(segmentation->Segments(), expected_segments);
971+
972+
ASSERT_EQ(segmentation->ToString(),
973+
R"(initial font: { gid0, gid69 }
974+
p0: { gid70, gid71, gid72 }
975+
if (s1) then p0
976+
)");
977+
}
978+
941979
TEST_F(ClosureGlyphSegmenterTest,
942980
InitFontMerging_DisjunctiveCheckedIndividually) {
943981
UnicodeFrequencies frequencies{

ift/encoder/merge_strategy.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,9 +153,7 @@ class MergeStrategy {
153153
init_font_merge_probability_threshold_ = value;
154154
}
155155

156-
void SetUsePatchMerges(bool value) {
157-
use_patch_merges_ = value;
158-
}
156+
void SetUsePatchMerges(bool value) { use_patch_merges_ = value; }
159157

160158
bool operator==(const MergeStrategy& other) const {
161159
return use_costs_ == other.use_costs_ &&

ift/encoder/merger.cc

Lines changed: 113 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -71,19 +71,19 @@ Merger::TryNextMerge() {
7171
return std::nullopt;
7272
}
7373

74-
Status Merger::MoveSegmentsToInitFont() {
75-
if (!strategy_.InitFontMergeThreshold().has_value()) {
76-
return absl::FailedPreconditionError(
77-
"Cannot be called when there is no merge threshold configured.");
78-
}
74+
SegmentSet Merger::InitFontSegmentsToCheck(const SegmentSet& inscope) const {
75+
SegmentSet to_check = inscope;
7976

80-
VLOG(0) << "Checking if there are any segments which should be moved into "
81-
"the initial font.";
77+
SegmentSet excluded = CutoffSegments();
78+
// Shared segments aren't subject to optimization cutoff. So only exclude
79+
// those in inscope_segments_ (which is all of the non-shared segments)
80+
excluded.intersect(inscope_segments_);
81+
to_check.subtract(excluded);
8282

83-
// TODO(garretrieger): This implementation can be further optimized:
84-
// - Make ReassignInitSegment() an incremental update instead of a full
85-
// reprocessing.
83+
return to_check;
84+
}
8685

86+
SegmentSet Merger::InitFontApplyProbabilityThreshold() const {
8787
SegmentSet below_threshold;
8888
if (strategy_.InitFontMergeProbabilityThreshold().has_value()) {
8989
for (segment_index_t s : inscope_segments_for_init_move_) {
@@ -96,60 +96,120 @@ Status Merger::MoveSegmentsToInitFont() {
9696

9797
SegmentSet inscope = inscope_segments_for_init_move_;
9898
inscope.subtract(below_threshold);
99-
VLOG(0) << inscope.size() << " inscope segments, " << below_threshold.size() << " skipped for being below the probability threshold.";
10099

101-
do {
102-
SegmentSet to_check = inscope;
100+
VLOG(0) << inscope.size() << " inscope segments, " << below_threshold.size()
101+
<< " skipped for being below the probability threshold.";
102+
return inscope;
103+
}
104+
105+
btree_map<ActivationCondition, GlyphSet> Merger::InitFontConditionsToCheck(
106+
const SegmentSet& to_check, bool batch_mode) const {
107+
// We only want to check conditions that use at least one segment which is
108+
// inscope for moving to the init font.
109+
btree_map<ActivationCondition, GlyphSet> conditions;
110+
for (segment_index_t s : to_check) {
111+
for (const auto& c :
112+
Context().glyph_groupings.TriggeringSegmentToConditions(s)) {
113+
if (conditions.contains(c)) {
114+
continue;
115+
}
116+
117+
if (batch_mode) {
118+
SegmentSet triggering_segments = c.TriggeringSegments();
119+
if (triggering_segments.size() != 1 ||
120+
!Context().InertSegments().contains(*triggering_segments.begin())) {
121+
// Non-inert conditions are skipped during the batch processing.
122+
continue;
123+
}
124+
}
125+
126+
GlyphSet glyphs = Context().glyph_groupings.ConditionsAndGlyphs().at(c);
127+
conditions.insert(std::make_pair(c, glyphs));
128+
}
129+
}
130+
return conditions;
131+
}
132+
133+
Status Merger::MoveSegmentsToInitFont() {
134+
if (!strategy_.InitFontMergeThreshold().has_value()) {
135+
return absl::FailedPreconditionError(
136+
"Cannot be called when there is no merge threshold configured.");
137+
}
103138

104-
SegmentSet excluded = CutoffSegments();
105-
// Shared segments aren't subject to optimization cutoff. So only exclude
106-
// those in inscope_segments_ (which is all of the non-shared segments)
107-
excluded.intersect(inscope_segments_);
139+
VLOG(0) << "Checking if there are any segments which should be moved into "
140+
"the initial font.";
108141

109-
to_check.subtract(excluded);
142+
SegmentSet inscope = InitFontApplyProbabilityThreshold();
143+
144+
// Init move processing works in two phases:
145+
//
146+
// First is batch mode. In batch mode only inert segments are checked
147+
// for move. Any segments that are below the threshold are moved to the
148+
// init font in a single operation. Because inert segments are not
149+
// expected to interact we don't need to reform the closure analysis
150+
// after each individual move to get an accurate cost delta.
151+
//
152+
// Once batch processing has no more moves left, the processing switches
153+
// to non-batch processing where all candidate conditions are checked
154+
// and moved one at a time.
155+
156+
bool batch_mode = true;
157+
VLOG(0) << " batch checking inert segments for move to init font.";
158+
do {
159+
SegmentSet to_check = InitFontSegmentsToCheck(inscope);
110160

111161
uint32_t init_font_size = TRY(Context().patch_size_cache->GetPatchSize(
112162
Context().SegmentationInfo().InitFontGlyphs()));
113163

164+
double total_delta = 0.0;
114165
double lowest_delta = *strategy_.InitFontMergeThreshold();
115166
std::optional<GlyphSet> glyphs_for_lowest = std::nullopt;
116167

117-
// We only want to check conditions that use at least one segment which is
118-
// inscope for moving to the init font.
119-
btree_map<ActivationCondition, GlyphSet> conditions;
120-
for (segment_index_t s : to_check) {
121-
for (const auto& c : Context().glyph_groupings.TriggeringSegmentToConditions(s)) {
122-
if (conditions.contains(c)) {
123-
continue;
124-
}
125-
126-
GlyphSet glyphs = Context().glyph_groupings.ConditionsAndGlyphs().at(c);
127-
conditions.insert(std::make_pair(c, glyphs));
128-
}
129-
}
168+
btree_map<ActivationCondition, GlyphSet> conditions =
169+
InitFontConditionsToCheck(to_check, batch_mode);
130170

131171
for (const auto& [condition, glyphs] : conditions) {
132-
double best_case_delta = TRY(CandidateMerge::ComputeInitFontCostDelta(
172+
auto [best_case_delta, _] = TRY(CandidateMerge::ComputeInitFontCostDelta(
133173
*this, init_font_size, true, glyphs));
134174
if (best_case_delta >= lowest_delta) {
135175
// Filter by best case first which is much faster to compute.
136176
continue;
137177
}
138178

139-
double delta = TRY(CandidateMerge::ComputeInitFontCostDelta(
179+
auto [delta, all_glyphs] = TRY(CandidateMerge::ComputeInitFontCostDelta(
140180
*this, init_font_size, false, glyphs));
141-
if (delta < lowest_delta) {
181+
if (delta >= lowest_delta) {
182+
continue;
183+
}
184+
185+
if (batch_mode) {
186+
// In batch mode we accept any merges under the threshold instead of
187+
// finding the lowest.
188+
if (!glyphs_for_lowest.has_value()) {
189+
glyphs_for_lowest = GlyphSet{};
190+
}
191+
total_delta += delta;
192+
glyphs_for_lowest->union_set(all_glyphs);
193+
} else {
142194
lowest_delta = delta;
143-
glyphs_for_lowest = glyphs;
195+
total_delta = delta;
196+
glyphs_for_lowest = all_glyphs;
144197
}
145198
}
146199

147200
if (!glyphs_for_lowest.has_value()) {
148-
// No more moves to make.
149-
break;
201+
if (batch_mode) {
202+
// Batch mode processing done, move on to non-batch processing.
203+
batch_mode = false;
204+
VLOG(0) << " switching to checking individually.";
205+
continue;
206+
} else {
207+
// No more moves to make.
208+
break;
209+
}
150210
}
151211

152-
TRYV(ApplyInitFontMove(*glyphs_for_lowest, lowest_delta));
212+
TRYV(ApplyInitFontMove(*glyphs_for_lowest, total_delta));
153213
} while (true);
154214

155215
VLOG(0) << "Initial font now has "
@@ -179,10 +239,21 @@ SegmentSet Merger::ComputeCandidateSegments(
179239
SegmentationContext& context, const MergeStrategy& strategy,
180240
const common::SegmentSet& inscope_segments) {
181241
SegmentSet candidate_segments;
182-
for (unsigned i = 0; i < context.SegmentationInfo().Segments().size(); i++) {
183-
if (!context.SegmentationInfo().Segments()[i].Definition().Empty() &&
184-
inscope_segments.contains(i)) {
185-
candidate_segments.insert(i);
242+
243+
if (inscope_segments.size() < context.SegmentationInfo().Segments().size()) {
244+
for (segment_index_t s : inscope_segments) {
245+
if (s < context.SegmentationInfo().Segments().size() &&
246+
!context.SegmentationInfo().Segments()[s].Definition().Empty()) {
247+
candidate_segments.insert(s);
248+
}
249+
}
250+
} else {
251+
for (segment_index_t s = 0;
252+
s < context.SegmentationInfo().Segments().size(); s++) {
253+
if (inscope_segments.contains(s) &&
254+
!context.SegmentationInfo().Segments()[s].Definition().Empty()) {
255+
candidate_segments.insert(s);
256+
}
186257
}
187258
}
188259

ift/encoder/merger.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,13 @@ class Merger {
137137
absl::Status ApplyInitFontMove(const common::GlyphSet& glyphs_to_move,
138138
double delta);
139139

140+
common::SegmentSet InitFontApplyProbabilityThreshold() const;
141+
common::SegmentSet InitFontSegmentsToCheck(
142+
const common::SegmentSet& inscope) const;
143+
absl::btree_map<ActivationCondition, common::GlyphSet>
144+
InitFontConditionsToCheck(const common::SegmentSet& to_check,
145+
bool batch_mode) const;
146+
140147
// Stores the broadeder complete segmentation.
141148
SegmentationContext* context_;
142149

ift/encoder/segmentation_context.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,6 @@ class SegmentationContext {
120120
for (segment_index_t segment_index = 0;
121121
segment_index < SegmentationInfo().Segments().size();
122122
segment_index++) {
123-
// TODO(garretrieger): when using this during the init font move
124-
// processing we know exactly which gids are removed so we can do a
125-
// partial instead of full invalidation.
126123
TRY(ReprocessSegment(segment_index));
127124
}
128125

util/segmenter_config_util.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,8 @@ StatusOr<MergeStrategy> SegmenterConfigUtil::ProtoToStrategy(
120120
}
121121

122122
if (merged.has_init_font_merge_probability_threshold()) {
123-
strategy.SetInitFontMergeProbabilityThreshold(merged.init_font_merge_probability_threshold());
123+
strategy.SetInitFontMergeProbabilityThreshold(
124+
merged.init_font_merge_probability_threshold());
124125
}
125126

126127
return strategy;

0 commit comments

Comments
 (0)