From 691b6fabf38a250836884f83be8f1466ff9d4fc1 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz
Date: Fri, 7 Mar 2025 12:11:09 +0100
Subject: [PATCH 1/3] fix: fix bug in ocrs, remove node_modules from cv
 example app

---
 .../com/swmansion/rnexecutorch/VerticalOCR.kt |   2 +-
 .../models/ocr/RecognitionHandler.kt          |   2 +-
 .../models/ocr/VerticalDetector.h             |  28 -----
 .../models/ocr/VerticalDetector.mm            | 118 ------------------
 ios/RnExecutorch/OCR.mm                       |  14 +--
 ios/RnExecutorch/VerticalOCR.mm               |   1 +
 ios/RnExecutorch/models/ocr/Detector.mm       |   2 +-
 ios/RnExecutorch/models/ocr/Recognizer.mm     |   8 +-
 .../models/ocr/utils/RecognizerUtils.mm       |  10 +-
 9 files changed, 19 insertions(+), 166 deletions(-)
 delete mode 100644 examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.h
 delete mode 100644 examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.mm

diff --git a/android/src/main/java/com/swmansion/rnexecutorch/VerticalOCR.kt b/android/src/main/java/com/swmansion/rnexecutorch/VerticalOCR.kt
index 2d800677..1b09d8f8 100644
--- a/android/src/main/java/com/swmansion/rnexecutorch/VerticalOCR.kt
+++ b/android/src/main/java/com/swmansion/rnexecutorch/VerticalOCR.kt
@@ -155,7 +155,7 @@ class VerticalOCR(reactContext: ReactApplicationContext) :
 
           resMap.putString("text", text)
           resMap.putArray("bbox", box.toWritableArray())
-          resMap.putDouble("confidence", confidenceScore)
+          resMap.putDouble("score", confidenceScore)
 
           predictions.pushMap(resMap)
         }
diff --git a/android/src/main/java/com/swmansion/rnexecutorch/models/ocr/RecognitionHandler.kt b/android/src/main/java/com/swmansion/rnexecutorch/models/ocr/RecognitionHandler.kt
index 90fd6128..451445d8 100644
--- a/android/src/main/java/com/swmansion/rnexecutorch/models/ocr/RecognitionHandler.kt
+++ b/android/src/main/java/com/swmansion/rnexecutorch/models/ocr/RecognitionHandler.kt
@@ -104,7 +104,7 @@ class RecognitionHandler(
 
       resMap.putString("text", decodedTexts[0])
       resMap.putArray("bbox", box.toWritableArray())
-      resMap.putDouble("confidence", confidenceScore)
+      resMap.putDouble("score", confidenceScore)
 
       res.pushMap(resMap)
     }
diff --git a/examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.h b/examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.h
deleted file mode 100644
index 8263ddd4..00000000
--- a/examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#import "BaseModel.h"
-#import "RecognitionHandler.h"
-#import "opencv2/opencv.hpp"
-
-constexpr CGFloat textThreshold = 0.4;
-constexpr CGFloat textThresholdVertical = 0.3;
-constexpr CGFloat linkThreshold = 0.4;
-constexpr CGFloat lowTextThreshold = 0.7;
-constexpr CGFloat centerThreshold = 0.5;
-constexpr CGFloat distanceThreshold = 2.0;
-constexpr CGFloat heightThreshold = 2.0;
-constexpr CGFloat restoreRatio = 3.2;
-constexpr CGFloat restoreRatioVertical = 2.0;
-constexpr int minSideThreshold = 15;
-constexpr int maxSideThreshold = 30;
-constexpr int maxWidth = largeModelWidth + (largeModelWidth * 0.15);
-constexpr int minSize = 20;
-
-const cv::Scalar mean(0.485, 0.456, 0.406);
-const cv::Scalar variance(0.229, 0.224, 0.225);
-
-@interface VerticalDetector : BaseModel
-
-- (instancetype)initWithDetectSingleCharacters:(BOOL)detectSingleCharacters;
-- (cv::Size)getModelImageSize;
-- (NSArray *)runModel:(cv::Mat &)input;
-
-@end
diff --git a/examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.mm b/examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.mm
deleted file mode 100644
index a2657a00..00000000
--- a/examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.mm
+++ /dev/null
@@ -1,118 +0,0 @@
-#import "VerticalDetector.h"
-#import "../../utils/ImageProcessor.h"
-#import "utils/DetectorUtils.h"
-#import "utils/OCRUtils.h"
-
-/*
- The model used as detector is based on CRAFT (Character Region Awareness for
- Text Detection) paper. https://arxiv.org/pdf/1904.01941
- */
-
-@implementation VerticalDetector {
-  cv::Size originalSize;
-  cv::Size modelSize;
-  BOOL detectSingleCharacters;
-}
-
-- (instancetype)initWithDetectSingleCharacters:(BOOL)detectSingleCharacters {
-  self = [super init];
-  if (self) {
-    self->detectSingleCharacters = detectSingleCharacters;
-  }
-  return self;
-}
-
-- (cv::Size)getModelImageSize {
-  if (!modelSize.empty()) {
-    return modelSize;
-  }
-
-  NSArray *inputShape = [module getInputShape:@0];
-  NSNumber *widthNumber = inputShape[inputShape.count - 2];
-  NSNumber *heightNumber = inputShape.lastObject;
-
-  const int height = [heightNumber intValue];
-  const int width = [widthNumber intValue];
-  modelSize = cv::Size(height, width);
-
-  return cv::Size(height, width);
-}
-
-- (NSArray *)preprocess:(cv::Mat &)input {
-  /*
-   Detector as an input accepts tensor with a shape of [1, 3, 800, 800].
-   Due to big influence of resize to quality of recognition the image preserves
-   original aspect ratio and the missing parts are filled with padding.
-   */
-  self->originalSize = cv::Size(input.cols, input.rows);
-  cv::Size modelImageSize = [self getModelImageSize];
-  cv::Mat resizedImage;
-  resizedImage = [OCRUtils resizeWithPadding:input
-                                desiredWidth:modelImageSize.width
-                               desiredHeight:modelImageSize.height];
-  NSArray *modelInput = [ImageProcessor matToNSArray:resizedImage
-                                                mean:mean
-                                            variance:variance];
-  return modelInput;
-}
-
-- (NSArray *)postprocess:(NSArray *)output {
-  /*
-   The output of the model consists of two matrices (heat maps):
-   1. ScoreText(Score map) - The probability of a region containing character
-   2. ScoreAffinity(Affinity map) - affinity between characters, used to to
-   group each character into a single instance (sequence) Both matrices are
-   400x400
-
-   The result of this step is a list of bounding boxes that contain text.
-   */
-  NSArray *predictions = [output objectAtIndex:0];
-
-  cv::Size modelImageSize = [self getModelImageSize];
-  cv::Mat scoreTextCV, scoreAffinityCV;
-  /*
-   The output of the model is a matrix in size of input image containing two
-   matrices representing heatmap. Those two matrices are in the size of half of
-   the input image, that's why the width and height is divided by 2.
-   */
-  [DetectorUtils interleavedArrayToMats:predictions
-                             outputMat1:scoreTextCV
-                             outputMat2:scoreAffinityCV
-                               withSize:cv::Size(modelImageSize.width / 2,
-                                                 modelImageSize.height / 2)];
-  CGFloat txtThreshold = textThreshold;
-  if (!self->detectSingleCharacters) {
-    txtThreshold = textThresholdVertical;
-  }
-  NSArray *bBoxesList = [DetectorUtils
-      getDetBoxesFromTextMapVertical:scoreTextCV
-                         affinityMap:scoreAffinityCV
-                  usingTextThreshold:txtThreshold
-                       linkThreshold:linkThreshold
-               independentCharacters:self->detectSingleCharacters];
-  bBoxesList = [DetectorUtils restoreBboxRatio:bBoxesList
-                             usingRestoreRatio:restoreRatioVertical];
-
-  if (self->detectSingleCharacters) {
-    return bBoxesList;
-  }
-
-  bBoxesList = [DetectorUtils groupTextBoxes:bBoxesList
-                             centerThreshold:centerThreshold
-                           distanceThreshold:distanceThreshold
-                             heightThreshold:heightThreshold
-                            minSideThreshold:minSideThreshold
-                            maxSideThreshold:maxSideThreshold
-                                    maxWidth:maxWidth];
-
-  return bBoxesList;
-}
-
-- (NSArray *)runModel:(cv::Mat &)input {
-  NSArray *modelInput = [self preprocess:input];
-  NSArray *modelResult = [self forward:modelInput];
-  NSArray *result = [self postprocess:modelResult];
-  return result;
-}
-
-@end
diff --git a/ios/RnExecutorch/OCR.mm b/ios/RnExecutorch/OCR.mm
index 509e3876..b2ac7993 100644
--- a/ios/RnExecutorch/OCR.mm
+++ b/ios/RnExecutorch/OCR.mm
@@ -1,8 +1,8 @@
 #import "OCR.h"
 #import "models/ocr/Detector.h"
 #import "models/ocr/RecognitionHandler.h"
-#import "utils/ImageProcessor.h"
 #import "models/ocr/utils/Constants.h"
+#import "utils/ImageProcessor.h"
 #import
 #import
@@ -80,16 +80,14 @@ of different sizes (e.g. large - 512x64, medium - 256x64, small - 128x64).
   @try {
     cv::Mat image = [ImageProcessor readImage:input];
     NSArray *result = [detector runModel:image];
-    cv::Size detectorSize = [detector getModelImageSize];
-    const CGFloat recognizerRatio = recognizerImageSize / detectorSize.width;
     cv::cvtColor(image, image, cv::COLOR_BGR2GRAY);
-    result = [self->recognitionHandler
-        recognize:result
-          imgGray:image
-     desiredWidth:detectorSize.width * recognizerRatio
-    desiredHeight:detectorSize.height * recognizerRatio];
+    result = [self->recognitionHandler recognize:result
+                                         imgGray:image
+                                    desiredWidth:recognizerImageSize
+                                   desiredHeight:recognizerImageSize];
     resolve(result);
   } @catch (NSException *exception) {
+    NSLog(@"%@", exception.reason);
     reject(@"forward_error",
            [NSString stringWithFormat:@"%@", exception.reason], nil);
   }
diff --git a/ios/RnExecutorch/VerticalOCR.mm b/ios/RnExecutorch/VerticalOCR.mm
index ef5e58a2..bc116e10 100644
--- a/ios/RnExecutorch/VerticalOCR.mm
+++ b/ios/RnExecutorch/VerticalOCR.mm
@@ -104,6 +104,7 @@ - (void)forward:(NSString *)input
             text = [text stringByAppendingString:decodedText[0]];
             confidenceScore = @([confidenceScore floatValue] + [[recognitionResult objectAtIndex:1] floatValue]);
           }else{
+            NSLog(@"width: %d, height: %d", croppedCharacter.cols, croppedCharacter.rows);
             croppedCharacters.push_back(croppedCharacter);
           }
         }
diff --git a/ios/RnExecutorch/models/ocr/Detector.mm b/ios/RnExecutorch/models/ocr/Detector.mm
index 5bec8836..bca0f50c 100644
--- a/ios/RnExecutorch/models/ocr/Detector.mm
+++ b/ios/RnExecutorch/models/ocr/Detector.mm
@@ -21,7 +21,7 @@ @implementation Detector {
   NSArray *inputShape = [module getInputShape:@0];
   NSNumber *widthNumber = inputShape[inputShape.count - 2];
   NSNumber *heightNumber = inputShape.lastObject;
-  
+
   const int height = [heightNumber intValue];
   const int width = [widthNumber intValue];
   modelSize = cv::Size(height, width);
diff --git a/ios/RnExecutorch/models/ocr/Recognizer.mm b/ios/RnExecutorch/models/ocr/Recognizer.mm
index e3ee9089..8b339bc2 100644
--- a/ios/RnExecutorch/models/ocr/Recognizer.mm
+++ b/ios/RnExecutorch/models/ocr/Recognizer.mm
@@ -14,8 +14,8 @@ @implementation Recognizer {
 
 - (cv::Size)getModelImageSize {
   NSArray *inputShape = [module getInputShape:@0];
-  NSNumber *widthNumber = inputShape[inputShape.count - 2];
-  NSNumber *heightNumber = inputShape.lastObject;
+  NSNumber *widthNumber = inputShape.lastObject;
+  NSNumber *heightNumber = inputShape[inputShape.count - 2];
 
   const int height = [heightNumber intValue];
   const int width = [widthNumber intValue];
@@ -24,8 +24,8 @@ @implementation Recognizer {
 
 - (cv::Size)getModelOutputSize {
   NSArray *outputShape = [module getOutputShape:@0];
-  NSNumber *widthNumber = outputShape[outputShape.count - 2];
-  NSNumber *heightNumber = outputShape.lastObject;
+  NSNumber *widthNumber = outputShape.lastObject;
+  NSNumber *heightNumber = outputShape[outputShape.count - 2];
 
   const int height = [heightNumber intValue];
   const int width = [widthNumber intValue];
diff --git a/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm b/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm
index 1908ad6f..f69e6756 100644
--- a/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm
+++ b/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm
@@ -251,17 +251,17 @@ + (double)computeConfidenceScore:(NSArray *)valuesArray
     points.emplace_back(cv::Point2f(point.x, point.y));
   }
-  
+
   cv::Rect rect = cv::boundingRect(points);
   cv::Mat croppedImage = img(rect);
+  cv::cvtColor(croppedImage, croppedImage, cv::COLOR_BGR2GRAY);
+  cv::resize(croppedImage, croppedImage, cv::Size(smallVerticalRecognizerWidth, recognizerHeight), 0, 0,
+             cv::INTER_AREA);
+  cv::medianBlur(img, img, 1);
 
   return croppedImage;
 }
 
 + (cv::Mat)cropSingleCharacter:(cv::Mat)img {
-  cv::cvtColor(img, img, cv::COLOR_BGR2GRAY);
-  cv::resize(img, img, cv::Size(smallVerticalRecognizerWidth, recognizerHeight), 0, 0,
-             cv::INTER_AREA);
-  cv::medianBlur(img, img, 1);
   cv::Mat histogram;

From 951c50b911216f15fb7afb23f8aaa74665631993 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz
Date: Fri, 7 Mar 2025 12:12:43 +0100
Subject: [PATCH 2/3] refactor: remove NSLogs

---
 ios/RnExecutorch/OCR.mm         | 1 -
 ios/RnExecutorch/VerticalOCR.mm | 1 -
 2 files changed, 2 deletions(-)

diff --git a/ios/RnExecutorch/OCR.mm b/ios/RnExecutorch/OCR.mm
index b2ac7993..bdff6127 100644
--- a/ios/RnExecutorch/OCR.mm
+++ b/ios/RnExecutorch/OCR.mm
@@ -87,7 +87,6 @@ of different sizes (e.g. large - 512x64, medium - 256x64, small - 128x64).
                                    desiredHeight:recognizerImageSize];
     resolve(result);
   } @catch (NSException *exception) {
-    NSLog(@"%@", exception.reason);
     reject(@"forward_error",
            [NSString stringWithFormat:@"%@", exception.reason], nil);
   }
diff --git a/ios/RnExecutorch/VerticalOCR.mm b/ios/RnExecutorch/VerticalOCR.mm
index bc116e10..ef5e58a2 100644
--- a/ios/RnExecutorch/VerticalOCR.mm
+++ b/ios/RnExecutorch/VerticalOCR.mm
@@ -104,7 +104,6 @@ - (void)forward:(NSString *)input
             text = [text stringByAppendingString:decodedText[0]];
             confidenceScore = @([confidenceScore floatValue] + [[recognitionResult objectAtIndex:1] floatValue]);
           }else{
-            NSLog(@"width: %d, height: %d", croppedCharacter.cols, croppedCharacter.rows);
             croppedCharacters.push_back(croppedCharacter);
           }
         }

From 0142162bb8afcf877b96944f308d3bcae064574b Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz
Date: Fri, 7 Mar 2025 13:47:37 +0100
Subject: [PATCH 3/3] chore: add audio api to example apps

---
 examples/computer-vision/package.json |  1 +
 examples/computer-vision/yarn.lock    | 13 +++++++++++++
 examples/llama/package.json           |  1 +
 examples/llama/yarn.lock              | 13 +++++++++++++
 4 files changed, 28 insertions(+)

diff --git a/examples/computer-vision/package.json b/examples/computer-vision/package.json
index 7f39a19e..c25a6069 100644
--- a/examples/computer-vision/package.json
+++ b/examples/computer-vision/package.json
@@ -16,6 +16,7 @@
     "metro-config": "^0.81.0",
     "react": "18.3.1",
     "react-native": "0.76.3",
+    "react-native-audio-api": "^0.4.13",
     "react-native-executorch": "^0.3.0",
     "react-native-image-picker": "^7.2.2",
     "react-native-loading-spinner-overlay": "^3.0.1",
diff --git a/examples/computer-vision/yarn.lock b/examples/computer-vision/yarn.lock
index 8f2c20fe..655cd2cf 100644
--- a/examples/computer-vision/yarn.lock
+++ b/examples/computer-vision/yarn.lock
@@ -3889,6 +3889,7 @@ __metadata:
     metro-config: ^0.81.0
     react: 18.3.1
     react-native: 0.76.3
+    react-native-audio-api: ^0.4.13
     react-native-executorch: ^0.3.0
     react-native-image-picker: ^7.2.2
     react-native-loading-spinner-overlay: ^3.0.1
@@ -7486,6 +7487,18 @@
   languageName: node
   linkType: hard
 
+"react-native-audio-api@npm:^0.4.13":
+  version: 0.4.13
+  resolution: "react-native-audio-api@npm:0.4.13"
+  peerDependencies:
+    react: "*"
+    react-native: "*"
+  bin:
+    setup-rn-audio-api-web: scripts/setup-rn-audio-api-web.js
+  checksum: 9a5db4626a0663224cdcdc957ed5176df9ed65df9f58bf06c5eae2a91895b7ee0f0a632ecaa79b23ef7536983485cea2720ca46201e2e690548f400b66970ff6
+  languageName: node
+  linkType: hard
+
 "react-native-executorch@npm:^0.3.0":
   version: 0.3.0
   resolution: "react-native-executorch@npm:0.3.0"
diff --git a/examples/llama/package.json b/examples/llama/package.json
index 6abbb086..022d7afb 100644
--- a/examples/llama/package.json
+++ b/examples/llama/package.json
@@ -16,6 +16,7 @@
     "metro-config": "^0.81.0",
     "react": "18.3.1",
     "react-native": "0.76.3",
+    "react-native-audio-api": "^0.4.13",
     "react-native-executorch": "^0.3.0",
     "react-native-loading-spinner-overlay": "^3.0.1",
     "react-native-markdown-display": "^7.0.2",
diff --git a/examples/llama/yarn.lock b/examples/llama/yarn.lock
index 4b6a4516..1059fc38 100644
--- a/examples/llama/yarn.lock
+++ b/examples/llama/yarn.lock
@@ -6074,6 +6074,7 @@ __metadata:
     metro-config: ^0.81.0
     react: 18.3.1
     react-native: 0.76.3
+    react-native-audio-api: ^0.4.13
     react-native-executorch: ^0.3.0
     react-native-loading-spinner-overlay: ^3.0.1
     react-native-markdown-display: ^7.0.2
@@ -7555,6 +7556,18 @@
   languageName: node
   linkType: hard
 
+"react-native-audio-api@npm:^0.4.13":
+  version: 0.4.13
+  resolution: "react-native-audio-api@npm:0.4.13"
+  peerDependencies:
+    react: "*"
+    react-native: "*"
+  bin:
+    setup-rn-audio-api-web: scripts/setup-rn-audio-api-web.js
+  checksum: 9a5db4626a0663224cdcdc957ed5176df9ed65df9f58bf06c5eae2a91895b7ee0f0a632ecaa79b23ef7536983485cea2720ca46201e2e690548f400b66970ff6
+  languageName: node
+  linkType: hard
+
 "react-native-executorch@npm:^0.3.0":
   version: 0.3.0
   resolution: "react-native-executorch@npm:0.3.0"