Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: implemented vertical ocr #109

Merged
merged 12 commits into the base branch from the feature branch
Mar 4, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,7 @@ class VerticalDetector(
Size(modelImageSize.width / 2, modelImageSize.height / 2)
)

var txtThreshold = Constants.TEXT_THRESHOLD

if (!detectSingleCharacter) {
txtThreshold = Constants.TEXT_THRESHOLD_VERTICAL
}

val txtThreshold = if (detectSingleCharacter) Constants.TEXT_THRESHOLD else Constants.TEXT_THRESHOLD_VERTICAL
var bBoxesList = DetectorUtils.getDetBoxesFromTextMapVertical(
scoreText,
scoreLink,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -332,9 +332,10 @@ class DetectorUtils {
val detectedBoxes = mutableListOf<OCRbBox>()
for (i in 1 until nLabels) {
val area = stats.get(i, Imgproc.CC_STAT_AREA)[0].toInt()
if (area < 20) continue

val height = stats.get(i, Imgproc.CC_STAT_HEIGHT)[0].toInt()
val width = stats.get(i, Imgproc.CC_STAT_WIDTH)[0].toInt()
if (area < 20) continue

if (!independentCharacters && height < width) continue
val mask = createMaskFromLabels(labels, i)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -256,16 +256,12 @@ class RecognizerUtils {
img = adjustContrastGrey(img, adjustContrast)
}

var desiredWidth =
if (isVertical) Constants.VERTICAL_SMALL_MODEL_WIDTH else Constants.SMALL_MODEL_WIDTH

if (img.width() >= Constants.LARGE_MODEL_WIDTH) {
desiredWidth = Constants.LARGE_MODEL_WIDTH
} else if (img.width() >= Constants.MEDIUM_MODEL_WIDTH) {
desiredWidth = Constants.MEDIUM_MODEL_WIDTH
val desiredWidth =when {
img.width() >= Constants.LARGE_MODEL_WIDTH -> Constants.LARGE_MODEL_WIDTH
img.width() >= Constants.MEDIUM_MODEL_WIDTH -> Constants.MEDIUM_MODEL_WIDTH
else -> if (isVertical) Constants.VERTICAL_SMALL_MODEL_WIDTH else Constants.SMALL_MODEL_WIDTH
}


img = ImageProcessor.resizeWithPadding(img, desiredWidth, Constants.MODEL_HEIGHT)
img.convertTo(img, CvType.CV_32F, 1.0 / 255.0)
Core.subtract(img, Scalar(0.5), img)
Expand Down
2 changes: 0 additions & 2 deletions ios/RnExecutorch/VerticalOCR.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#import <RnExecutorchSpec/RnExecutorchSpec.h>

constexpr CGFloat recognizerRatio = 1.6;

@interface VerticalOCR : NSObject <NativeVerticalOCRSpec>

@end
18 changes: 1 addition & 17 deletions ios/RnExecutorch/models/ocr/Detector.h
Original file line number Diff line number Diff line change
@@ -1,23 +1,7 @@
#import "BaseModel.h"
#import "RecognitionHandler.h"
#import "opencv2/opencv.hpp"

constexpr CGFloat textThreshold = 0.4;
constexpr CGFloat textThresholdVertical = 0.3;
constexpr CGFloat linkThreshold = 0.4;
constexpr CGFloat lowTextThreshold = 0.7;
constexpr CGFloat centerThreshold = 0.5;
constexpr CGFloat distanceThreshold = 2.0;
constexpr CGFloat heightThreshold = 2.0;
constexpr CGFloat restoreRatio = 3.2;
constexpr CGFloat restoreRatioVertical = 2.0;
constexpr int minSideThreshold = 15;
constexpr int maxSideThreshold = 30;
constexpr int maxWidth = largeModelWidth + (largeModelWidth * 0.15);
constexpr int minSize = 20;

const cv::Scalar mean(0.485, 0.456, 0.406);
const cv::Scalar variance(0.229, 0.224, 0.225);
#import "utils/Constants.h"

@interface Detector : BaseModel

Expand Down
18 changes: 1 addition & 17 deletions ios/RnExecutorch/models/ocr/VerticalDetector.h
Original file line number Diff line number Diff line change
@@ -1,23 +1,7 @@
#import "BaseModel.h"
#import "RecognitionHandler.h"
#import "opencv2/opencv.hpp"

constexpr CGFloat textThreshold = 0.4;
constexpr CGFloat textThresholdVertical = 0.3;
constexpr CGFloat linkThreshold = 0.4;
constexpr CGFloat lowTextThreshold = 0.7;
constexpr CGFloat centerThreshold = 0.5;
constexpr CGFloat distanceThreshold = 2.0;
constexpr CGFloat heightThreshold = 2.0;
constexpr CGFloat restoreRatio = 3.2;
constexpr CGFloat restoreRatioVertical = 2.0;
constexpr int minSideThreshold = 15;
constexpr int maxSideThreshold = 30;
constexpr int maxWidth = largeModelWidth + (largeModelWidth * 0.15);
constexpr int minSize = 20;

const cv::Scalar mean(0.485, 0.456, 0.406);
const cv::Scalar variance(0.229, 0.224, 0.225);
#import "utils/Constants.h"

@interface VerticalDetector : BaseModel

Expand Down
7 changes: 3 additions & 4 deletions ios/RnExecutorch/models/ocr/VerticalDetector.mm
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,9 @@ group each character into a single instance (sequence) Both matrices are
outputMat2:scoreAffinityCV
withSize:cv::Size(modelImageSize.width / 2,
modelImageSize.height / 2)];
CGFloat txtThreshold = textThreshold;
if (!self->detectSingleCharacters) {
txtThreshold = textThresholdVertical;
}
CGFloat txtThreshold = (self->detectSingleCharacters) ? textThreshold
: textThresholdVertical;

NSArray *bBoxesList = [DetectorUtils
getDetBoxesFromTextMapVertical:scoreTextCV
affinityMap:scoreAffinityCV
Expand Down
17 changes: 17 additions & 0 deletions ios/RnExecutorch/models/ocr/utils/Constants.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#pragma once
// Shared OCR tuning constants, extracted so Detector.h and VerticalDetector.h
// no longer carry duplicate copies. Values must stay in sync with the Kotlin
// Constants used by the Android implementation.
//
// NOTE(review): this header is not self-contained — it assumes CGFloat
// (CoreGraphics), cv::Scalar (opencv2), and `largeModelWidth` are already in
// scope at every include site. Confirm all includers provide them, or add the
// corresponding includes here.

// Score-map binarization thresholds for text detection.
constexpr CGFloat textThreshold = 0.4;
constexpr CGFloat textThresholdVertical = 0.3;
constexpr CGFloat linkThreshold = 0.4;
constexpr CGFloat lowTextThreshold = 0.7;
// Box grouping/merging thresholds.
constexpr CGFloat centerThreshold = 0.5;
constexpr CGFloat distanceThreshold = 2.0;
constexpr CGFloat heightThreshold = 2.0;
// Ratios used to restore detected boxes to original image coordinates.
constexpr CGFloat restoreRatio = 3.2;
constexpr CGFloat restoreRatioVertical = 2.0;
// Pixel-size limits for accepted detections.
constexpr int minSideThreshold = 15;
constexpr int maxSideThreshold = 30;
// Maximum accepted box width: model width plus a 15% margin.
constexpr int maxWidth = largeModelWidth + (largeModelWidth * 0.15);
constexpr int minSize = 20;

// Per-channel normalization applied to model input (ImageNet mean/std).
// `const` at namespace scope has internal linkage, so each translation unit
// gets its own copy — no ODR violation, but the guard above prevents
// redefinition within a single TU.
const cv::Scalar mean(0.485, 0.456, 0.406);
const cv::Scalar variance(0.229, 0.224, 0.225);

4 changes: 2 additions & 2 deletions ios/RnExecutorch/models/ocr/utils/DetectorUtils.mm
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ + (NSArray *)getDetBoxesFromTextMapVertical:(cv::Mat)textMap
NSMutableArray *detectedBoxes = [NSMutableArray array];
for (int i = 1; i < nLabels; i++) {
const int area = stats.at<int>(i, cv::CC_STAT_AREA);
const int width = stats.at<int>(i, cv::CC_STAT_WIDTH);
const int height = stats.at<int>(i, cv::CC_STAT_HEIGHT);
if (area < 20)
continue;
const int width = stats.at<int>(i, cv::CC_STAT_WIDTH);
const int height = stats.at<int>(i, cv::CC_STAT_HEIGHT);

if (!independentCharacters && height < width)
continue;
Expand Down
1 change: 1 addition & 0 deletions ios/RnExecutorch/models/ocr/utils/OCRUtils.mm
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ @implementation OCRUtils

+ (cv::Rect)extractBoundingBox:(NSArray *)coords {
std::vector<cv::Point2f> points;
points.reserve(coords.count);
for (NSValue *value in coords) {
const CGPoint point = [value CGPointValue];

Expand Down
11 changes: 4 additions & 7 deletions ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,8 @@ + (CGFloat)calculateRatio:(int)width height:(int)height {
image = [self adjustContrastGrey:image target:adjustContrast];
}

int desiredWidth;
if (isVertical){
desiredWidth = 64;
}else{
desiredWidth = 128;
}
int desiredWidth = (isVertical) ? 64 : 128;

if (image.cols >= 512) {
desiredWidth = 512;
} else if (image.cols >= 256) {
Expand Down Expand Up @@ -233,6 +229,7 @@ + (double)computeConfidenceScore:(NSArray<NSNumber *> *)valuesArray
originalPaddings:(NSDictionary *)originalPaddings {
CGPoint topLeft = [originalBbox[0] CGPointValue];
std::vector<cv::Point2f> points;
points.reserve(bbox.count);
for (NSValue *coords in bbox) {
CGPoint point = [coords CGPointValue];

Expand All @@ -251,7 +248,7 @@ + (double)computeConfidenceScore:(NSArray<NSNumber *> *)valuesArray
point.x = point.x * [originalPaddings[@"resizeRatio"] floatValue];
point.y = point.y * [originalPaddings[@"resizeRatio"] floatValue];

points.push_back(cv::Point2f(point.x, point.y));
points.emplace_back(cv::Point2f(point.x, point.y));
}

cv::Rect rect = cv::boundingRect(points);
Expand Down
4 changes: 0 additions & 4 deletions src/constants/ocr/languageDicts.ts

This file was deleted.

3 changes: 1 addition & 2 deletions src/hooks/computer_vision/useOCR.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { useEffect, useState } from 'react';
import { fetchResource } from '../../utils/fetchResource';
import { languageDicts } from '../../constants/ocr/languageDicts';
import { symbols } from '../../constants/ocr/symbols';
import { getError, ETError } from '../../Error';
import { OCR } from '../../native/RnExecutorchModules';
Expand Down Expand Up @@ -45,7 +44,7 @@ export const useOCR = ({
recognizerSmall: string;
};

if (!symbols[language] || !languageDicts[language]) {
if (!symbols[language]) {
setError(getError(ETError.LanguageNotSupported));
return;
}
Expand Down
41 changes: 14 additions & 27 deletions src/hooks/computer_vision/useVerticalOCR.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { useEffect, useState } from 'react';
import { fetchResource } from '../../utils/fetchResource';
import { languageDicts } from '../../constants/ocr/languageDicts';
import { symbols } from '../../constants/ocr/symbols';
import { getError, ETError } from '../../Error';
import { VerticalOCR } from '../../native/RnExecutorchModules';
Expand Down Expand Up @@ -46,37 +45,25 @@ export const useVerticalOCR = ({
)
return;

let recognizerPath;

const detectorPaths = {} as {
detectorLarge: string;
detectorNarrow: string;
};

if (!symbols[language] || !languageDicts[language]) {
if (!symbols[language]) {
setError(getError(ETError.LanguageNotSupported));
return;
}

await Promise.all([
fetchResource(detectorSources.detectorLarge),
fetchResource(detectorSources.detectorNarrow),
]).then((values) => {
detectorPaths.detectorLarge = values[0];
detectorPaths.detectorNarrow = values[1];
});
const recognizerPath = independentCharacters
? await fetchResource(
recognizerSources.recognizerSmall,
setDownloadProgress
)
: await fetchResource(
recognizerSources.recognizerLarge,
setDownloadProgress
);

if (independentCharacters) {
recognizerPath = await fetchResource(
recognizerSources.recognizerSmall,
setDownloadProgress
);
} else {
recognizerPath = await fetchResource(
recognizerSources.recognizerLarge,
setDownloadProgress
);
}
const detectorPaths = {
detectorLarge: await fetchResource(detectorSources.detectorLarge),
detectorNarrow: await fetchResource(detectorSources.detectorNarrow),
};

setIsReady(false);
await VerticalOCR.loadModule(
Expand Down
3 changes: 1 addition & 2 deletions src/modules/computer_vision/OCRModule.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import { languageDicts } from '../../constants/ocr/languageDicts';
import { symbols } from '../../constants/ocr/symbols';
import { getError, ETError } from '../../Error';
import { OCR } from '../../native/RnExecutorchModules';
Expand Down Expand Up @@ -27,7 +26,7 @@ export class OCRModule {
recognizerSmall: string;
};

if (!symbols[language] || !languageDicts[language]) {
if (!symbols[language]) {
throw new Error(getError(ETError.LanguageNotSupported));
}

Expand Down
Loading