Skip to content

Commit d6d33c4

Browse files
committed
fix: Merge only blocks of text with the same style into the same paragraph.
1 parent 031a856 commit d6d33c4

File tree

5 files changed

+306
-283
lines changed

5 files changed

+306
-283
lines changed

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/ParagraphProcessor.java

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
import org.verapdf.wcag.algorithms.entities.enums.TextAlignment;
1717
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.CaptionUtils;
1818
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.ChunksMergeUtils;
19+
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.NodeUtils;
1920

2021
import java.util.ArrayList;
2122
import java.util.Iterator;
@@ -302,6 +303,9 @@ private static List<TextBlock> processOtherLines(List<TextBlock> textBlocks) {
302303
}
303304

304305
private static boolean isOneParagraph(TextBlock previousBlock, TextBlock nextBlock) {
306+
if (!areCloseStyle(previousBlock, nextBlock)) {
307+
return false;
308+
}
305309
double probability = getDifferentLinesProbability(previousBlock, nextBlock);
306310
return CaptionUtils.areOverlapping(previousBlock.getLastLine(), nextBlock.getFirstLine().getBoundingBox()) &&
307311
probability > DIFFERENT_LINES_PROBABILITY &&
@@ -348,4 +352,10 @@ private static double getDifferentLinesProbability(TextBlock previousBlock, Text
348352
}
349353
return 0;
350354
}
355+
356+
private static boolean areCloseStyle(TextBlock previousBlock, TextBlock nextBlock) {
357+
return NodeUtils.areCloseNumbers(previousBlock.getFontSize(), nextBlock.getFontSize(), 1e-1) &&
358+
NodeUtils.areCloseNumbers(previousBlock.getFirstLine().getFirstTextChunk().getFontWeight(),
359+
nextBlock.getFirstLine().getFirstTextChunk().getFontWeight(), 1e-1);
360+
}
351361
}

resources/1901.03003.html

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,9 @@ <h3>Abstract</h3>
1515

1616
<p>Keywords: Scene text recognition, optical character recognition, deep learning.</p>
1717

18-
<p>∗Corresponding author 1https://github.com/Canjie-Luo/MORAN_v2</p>
18+
<p>∗Corresponding author</p>
19+
20+
<p>1https://github.com/Canjie-Luo/MORAN_v2</p>
1921

2022
<img src="1901.03003_figures/figure1.png" alt="figure1">
2123

0 commit comments

Comments
 (0)