From 691b6fabf38a250836884f83be8f1466ff9d4fc1 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz
Date: Fri, 7 Mar 2025 12:11:09 +0100
Subject: [PATCH 1/3] fix: fix bug in ocrs, remove node_modules from cv
 example app

---
 .../com/swmansion/rnexecutorch/VerticalOCR.kt |   2 +-
 .../models/ocr/RecognitionHandler.kt          |   2 +-
 .../models/ocr/VerticalDetector.h             |  28 -----
 .../models/ocr/VerticalDetector.mm            | 118 ------------------
 ios/RnExecutorch/OCR.mm                       |  14 +--
 ios/RnExecutorch/VerticalOCR.mm               |   1 +
 ios/RnExecutorch/models/ocr/Detector.mm       |   2 +-
 ios/RnExecutorch/models/ocr/Recognizer.mm     |   8 +-
 .../models/ocr/utils/RecognizerUtils.mm       |  10 +-
 9 files changed, 19 insertions(+), 166 deletions(-)
 delete mode 100644 examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.h
 delete mode 100644 examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.mm

diff --git a/android/src/main/java/com/swmansion/rnexecutorch/VerticalOCR.kt b/android/src/main/java/com/swmansion/rnexecutorch/VerticalOCR.kt
index 2d800677..1b09d8f8 100644
--- a/android/src/main/java/com/swmansion/rnexecutorch/VerticalOCR.kt
+++ b/android/src/main/java/com/swmansion/rnexecutorch/VerticalOCR.kt
@@ -155,7 +155,7 @@ class VerticalOCR(reactContext: ReactApplicationContext) :
 
           resMap.putString("text", text)
           resMap.putArray("bbox", box.toWritableArray())
-          resMap.putDouble("confidence", confidenceScore)
+          resMap.putDouble("score", confidenceScore)
 
           predictions.pushMap(resMap)
         }
diff --git a/android/src/main/java/com/swmansion/rnexecutorch/models/ocr/RecognitionHandler.kt b/android/src/main/java/com/swmansion/rnexecutorch/models/ocr/RecognitionHandler.kt
index 90fd6128..451445d8 100644
--- a/android/src/main/java/com/swmansion/rnexecutorch/models/ocr/RecognitionHandler.kt
+++ b/android/src/main/java/com/swmansion/rnexecutorch/models/ocr/RecognitionHandler.kt
@@ -104,7 +104,7 @@ class RecognitionHandler(
 
       resMap.putString("text", decodedTexts[0])
       resMap.putArray("bbox", box.toWritableArray())
-      resMap.putDouble("confidence", confidenceScore)
+      resMap.putDouble("score", confidenceScore)
 
       res.pushMap(resMap)
     }
diff --git a/examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.h b/examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.h
deleted file mode 100644
index 8263ddd4..00000000
--- a/examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#import "BaseModel.h"
-#import "RecognitionHandler.h"
-#import "opencv2/opencv.hpp"
-
-constexpr CGFloat textThreshold = 0.4;
-constexpr CGFloat textThresholdVertical = 0.3;
-constexpr CGFloat linkThreshold = 0.4;
-constexpr CGFloat lowTextThreshold = 0.7;
-constexpr CGFloat centerThreshold = 0.5;
-constexpr CGFloat distanceThreshold = 2.0;
-constexpr CGFloat heightThreshold = 2.0;
-constexpr CGFloat restoreRatio = 3.2;
-constexpr CGFloat restoreRatioVertical = 2.0;
-constexpr int minSideThreshold = 15;
-constexpr int maxSideThreshold = 30;
-constexpr int maxWidth = largeModelWidth + (largeModelWidth * 0.15);
-constexpr int minSize = 20;
-
-const cv::Scalar mean(0.485, 0.456, 0.406);
-const cv::Scalar variance(0.229, 0.224, 0.225);
-
-@interface VerticalDetector : BaseModel
-
-- (instancetype)initWithDetectSingleCharacters:(BOOL)detectSingleCharacters;
-- (cv::Size)getModelImageSize;
-- (NSArray *)runModel:(cv::Mat &)input;
-
-@end
diff --git a/examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.mm b/examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.mm
deleted file mode 100644
index a2657a00..00000000
--- a/examples/computer-vision/node_modules/react-native-executorch/ios/RnExecutorch/models/ocr/VerticalDetector.mm
+++ /dev/null
@@ -1,118 +0,0 @@
-#import "VerticalDetector.h"
-#import "../../utils/ImageProcessor.h"
-#import "utils/DetectorUtils.h"
-#import "utils/OCRUtils.h"
-
-/*
- The model used as detector is based on CRAFT (Character Region Awareness for
- Text Detection) paper. https://arxiv.org/pdf/1904.01941
- */
-
-@implementation VerticalDetector {
-  cv::Size originalSize;
-  cv::Size modelSize;
-  BOOL detectSingleCharacters;
-}
-
-- (instancetype)initWithDetectSingleCharacters:(BOOL)detectSingleCharacters {
-  self = [super init];
-  if (self) {
-    self->detectSingleCharacters = detectSingleCharacters;
-  }
-  return self;
-}
-
-- (cv::Size)getModelImageSize {
-  if (!modelSize.empty()) {
-    return modelSize;
-  }
-
-  NSArray *inputShape = [module getInputShape:@0];
-  NSNumber *widthNumber = inputShape[inputShape.count - 2];
-  NSNumber *heightNumber = inputShape.lastObject;
-
-  const int height = [heightNumber intValue];
-  const int width = [widthNumber intValue];
-  modelSize = cv::Size(height, width);
-
-  return cv::Size(height, width);
-}
-
-- (NSArray *)preprocess:(cv::Mat &)input {
-  /*
-   Detector as an input accepts tensor with a shape of [1, 3, 800, 800].
-   Due to big influence of resize to quality of recognition the image preserves
-   original aspect ratio and the missing parts are filled with padding.
-   */
-  self->originalSize = cv::Size(input.cols, input.rows);
-  cv::Size modelImageSize = [self getModelImageSize];
-  cv::Mat resizedImage;
-  resizedImage = [OCRUtils resizeWithPadding:input
-                                desiredWidth:modelImageSize.width
-                               desiredHeight:modelImageSize.height];
-  NSArray *modelInput = [ImageProcessor matToNSArray:resizedImage
-                                                mean:mean
-                                            variance:variance];
-  return modelInput;
-}
-
-- (NSArray *)postprocess:(NSArray *)output {
-  /*
-   The output of the model consists of two matrices (heat maps):
-   1. ScoreText(Score map) - The probability of a region containing character
-   2. ScoreAffinity(Affinity map) - affinity between characters, used to to
-   group each character into a single instance (sequence) Both matrices are
-   400x400
-
-   The result of this step is a list of bounding boxes that contain text.
-   */
-  NSArray *predictions = [output objectAtIndex:0];
-
-  cv::Size modelImageSize = [self getModelImageSize];
-  cv::Mat scoreTextCV, scoreAffinityCV;
-  /*
-   The output of the model is a matrix in size of input image containing two
-   matrices representing heatmap. Those two matrices are in the size of half of
-   the input image, that's why the width and height is divided by 2.
-   */
-  [DetectorUtils interleavedArrayToMats:predictions
-                             outputMat1:scoreTextCV
-                             outputMat2:scoreAffinityCV
-                               withSize:cv::Size(modelImageSize.width / 2,
-                                                 modelImageSize.height / 2)];
-  CGFloat txtThreshold = textThreshold;
-  if (!self->detectSingleCharacters) {
-    txtThreshold = textThresholdVertical;
-  }
-  NSArray *bBoxesList = [DetectorUtils
-      getDetBoxesFromTextMapVertical:scoreTextCV
-                         affinityMap:scoreAffinityCV
-                  usingTextThreshold:txtThreshold
-                       linkThreshold:linkThreshold
-               independentCharacters:self->detectSingleCharacters];
-  bBoxesList = [DetectorUtils restoreBboxRatio:bBoxesList
-                             usingRestoreRatio:restoreRatioVertical];
-
-  if (self->detectSingleCharacters) {
-    return bBoxesList;
-  }
-
-  bBoxesList = [DetectorUtils groupTextBoxes:bBoxesList
-                             centerThreshold:centerThreshold
-                           distanceThreshold:distanceThreshold
-                             heightThreshold:heightThreshold
-                            minSideThreshold:minSideThreshold
-                            maxSideThreshold:maxSideThreshold
-                                    maxWidth:maxWidth];
-
-  return bBoxesList;
-}
-
-- (NSArray *)runModel:(cv::Mat &)input {
-  NSArray *modelInput = [self preprocess:input];
-  NSArray *modelResult = [self forward:modelInput];
-  NSArray *result = [self postprocess:modelResult];
-  return result;
-}
-
-@end
diff --git a/ios/RnExecutorch/OCR.mm b/ios/RnExecutorch/OCR.mm
index 509e3876..b2ac7993 100644
--- a/ios/RnExecutorch/OCR.mm
+++ b/ios/RnExecutorch/OCR.mm
@@ -1,8 +1,8 @@
 #import "OCR.h"
 #import "models/ocr/Detector.h"
 #import "models/ocr/RecognitionHandler.h"
-#import "utils/ImageProcessor.h"
 #import "models/ocr/utils/Constants.h"
+#import "utils/ImageProcessor.h"
 #import
 #import
@@ -80,16 +80,14 @@ of different sizes (e.g. large - 512x64, medium - 256x64, small - 128x64).
   @try {
     cv::Mat image = [ImageProcessor readImage:input];
     NSArray *result = [detector runModel:image];
-    cv::Size detectorSize = [detector getModelImageSize];
-    const CGFloat recognizerRatio = recognizerImageSize / detectorSize.width;
     cv::cvtColor(image, image, cv::COLOR_BGR2GRAY);
-    result = [self->recognitionHandler
-        recognize:result
-          imgGray:image
-     desiredWidth:detectorSize.width * recognizerRatio
-    desiredHeight:detectorSize.height * recognizerRatio];
+    result = [self->recognitionHandler recognize:result
+                                         imgGray:image
+                                    desiredWidth:recognizerImageSize
+                                   desiredHeight:recognizerImageSize];
     resolve(result);
   } @catch (NSException *exception) {
+    NSLog(@"%@", exception.reason);
     reject(@"forward_error",
            [NSString stringWithFormat:@"%@", exception.reason], nil);
   }
diff --git a/ios/RnExecutorch/VerticalOCR.mm b/ios/RnExecutorch/VerticalOCR.mm
index ef5e58a2..bc116e10 100644
--- a/ios/RnExecutorch/VerticalOCR.mm
+++ b/ios/RnExecutorch/VerticalOCR.mm
@@ -104,6 +104,7 @@ - (void)forward:(NSString *)input
             text = [text stringByAppendingString:decodedText[0]];
             confidenceScore = @([confidenceScore floatValue] + [[recognitionResult objectAtIndex:1] floatValue]);
           }else{
+            NSLog(@"width: %d, height: %d", croppedCharacter.cols, croppedCharacter.rows);
             croppedCharacters.push_back(croppedCharacter);
           }
         }
diff --git a/ios/RnExecutorch/models/ocr/Detector.mm b/ios/RnExecutorch/models/ocr/Detector.mm
index 5bec8836..bca0f50c 100644
--- a/ios/RnExecutorch/models/ocr/Detector.mm
+++ b/ios/RnExecutorch/models/ocr/Detector.mm
@@ -21,7 +21,7 @@ @implementation Detector {
   NSArray *inputShape = [module getInputShape:@0];
   NSNumber *widthNumber = inputShape[inputShape.count - 2];
   NSNumber *heightNumber = inputShape.lastObject;
-  
+
   const int height = [heightNumber intValue];
   const int width = [widthNumber intValue];
   modelSize = cv::Size(height, width);
diff --git a/ios/RnExecutorch/models/ocr/Recognizer.mm b/ios/RnExecutorch/models/ocr/Recognizer.mm
index e3ee9089..8b339bc2 100644
--- a/ios/RnExecutorch/models/ocr/Recognizer.mm
+++ b/ios/RnExecutorch/models/ocr/Recognizer.mm
@@ -14,8 +14,8 @@ @implementation Recognizer {
 
 - (cv::Size)getModelImageSize {
   NSArray *inputShape = [module getInputShape:@0];
-  NSNumber *widthNumber = inputShape[inputShape.count - 2];
-  NSNumber *heightNumber = inputShape.lastObject;
+  NSNumber *widthNumber = inputShape.lastObject;
+  NSNumber *heightNumber = inputShape[inputShape.count - 2];
 
   const int height = [heightNumber intValue];
   const int width = [widthNumber intValue];
@@ -24,8 +24,8 @@ @implementation Recognizer {
 
 - (cv::Size)getModelOutputSize {
   NSArray *outputShape = [module getOutputShape:@0];
-  NSNumber *widthNumber = outputShape[outputShape.count - 2];
-  NSNumber *heightNumber = outputShape.lastObject;
+  NSNumber *widthNumber = outputShape.lastObject;
+  NSNumber *heightNumber = outputShape[outputShape.count - 2];
 
   const int height = [heightNumber intValue];
   const int width = [widthNumber intValue];
diff --git a/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm b/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm
index 1908ad6f..f69e6756 100644
--- a/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm
+++ b/ios/RnExecutorch/models/ocr/utils/RecognizerUtils.mm
@@ -251,17 +251,17 @@ + (double)computeConfidenceScore:(NSArray *)valuesArray
     points.emplace_back(cv::Point2f(point.x, point.y));
   }
-  
+
   cv::Rect rect = cv::boundingRect(points);
   cv::Mat croppedImage = img(rect);
+  cv::cvtColor(croppedImage, croppedImage, cv::COLOR_BGR2GRAY);
+  cv::resize(croppedImage, croppedImage, cv::Size(smallVerticalRecognizerWidth, recognizerHeight), 0, 0,
+             cv::INTER_AREA);
+  cv::medianBlur(img, img, 1);
 
   return croppedImage;
 }
 
 + (cv::Mat)cropSingleCharacter:(cv::Mat)img {
-  cv::cvtColor(img, img, cv::COLOR_BGR2GRAY);
-  cv::resize(img, img, cv::Size(smallVerticalRecognizerWidth, recognizerHeight), 0, 0,
-             cv::INTER_AREA);
-  cv::medianBlur(img, img, 1);
   cv::Mat histogram;

From 951c50b911216f15fb7afb23f8aaa74665631993 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz
Date: Fri, 7 Mar 2025 12:12:43 +0100
Subject: [PATCH 2/3] refactor: remove NSLogs

---
 ios/RnExecutorch/OCR.mm         | 1 -
 ios/RnExecutorch/VerticalOCR.mm | 1 -
 2 files changed, 2 deletions(-)

diff --git a/ios/RnExecutorch/OCR.mm b/ios/RnExecutorch/OCR.mm
index b2ac7993..bdff6127 100644
--- a/ios/RnExecutorch/OCR.mm
+++ b/ios/RnExecutorch/OCR.mm
@@ -87,7 +87,6 @@ of different sizes (e.g. large - 512x64, medium - 256x64, small - 128x64).
                                    desiredHeight:recognizerImageSize];
     resolve(result);
   } @catch (NSException *exception) {
-    NSLog(@"%@", exception.reason);
     reject(@"forward_error",
            [NSString stringWithFormat:@"%@", exception.reason], nil);
   }
diff --git a/ios/RnExecutorch/VerticalOCR.mm b/ios/RnExecutorch/VerticalOCR.mm
index bc116e10..ef5e58a2 100644
--- a/ios/RnExecutorch/VerticalOCR.mm
+++ b/ios/RnExecutorch/VerticalOCR.mm
@@ -104,7 +104,6 @@ - (void)forward:(NSString *)input
             text = [text stringByAppendingString:decodedText[0]];
             confidenceScore = @([confidenceScore floatValue] + [[recognitionResult objectAtIndex:1] floatValue]);
           }else{
-            NSLog(@"width: %d, height: %d", croppedCharacter.cols, croppedCharacter.rows);
             croppedCharacters.push_back(croppedCharacter);
           }
         }

From 0142162bb8afcf877b96944f308d3bcae064574b Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz
Date: Fri, 7 Mar 2025 13:47:37 +0100
Subject: [PATCH 3/3] chore: add audio api to example apps

---
 examples/computer-vision/package.json |  1 +
 examples/computer-vision/yarn.lock    | 13 +++++++++++++
 examples/llama/package.json           |  1 +
 examples/llama/yarn.lock              | 13 +++++++++++++
 4 files changed, 28 insertions(+)

diff --git a/examples/computer-vision/package.json b/examples/computer-vision/package.json
index 7f39a19e..c25a6069 100644
--- a/examples/computer-vision/package.json
+++ b/examples/computer-vision/package.json
@@ -16,6 +16,7 @@
     "metro-config": "^0.81.0",
     "react": "18.3.1",
     "react-native": "0.76.3",
+    "react-native-audio-api": "^0.4.13",
     "react-native-executorch": "^0.3.0",
     "react-native-image-picker": "^7.2.2",
     "react-native-loading-spinner-overlay": "^3.0.1",
diff --git a/examples/computer-vision/yarn.lock b/examples/computer-vision/yarn.lock
index 8f2c20fe..655cd2cf 100644
--- a/examples/computer-vision/yarn.lock
+++ b/examples/computer-vision/yarn.lock
@@ -3889,6 +3889,7 @@ __metadata:
     metro-config: ^0.81.0
     react: 18.3.1
     react-native: 0.76.3
+    react-native-audio-api: ^0.4.13
     react-native-executorch: ^0.3.0
     react-native-image-picker: ^7.2.2
     react-native-loading-spinner-overlay: ^3.0.1
@@ -7486,6 +7487,18 @@
   languageName: node
   linkType: hard
 
+"react-native-audio-api@npm:^0.4.13":
+  version: 0.4.13
+  resolution: "react-native-audio-api@npm:0.4.13"
+  peerDependencies:
+    react: "*"
+    react-native: "*"
+  bin:
+    setup-rn-audio-api-web: scripts/setup-rn-audio-api-web.js
+  checksum: 9a5db4626a0663224cdcdc957ed5176df9ed65df9f58bf06c5eae2a91895b7ee0f0a632ecaa79b23ef7536983485cea2720ca46201e2e690548f400b66970ff6
+  languageName: node
+  linkType: hard
+
 "react-native-executorch@npm:^0.3.0":
   version: 0.3.0
   resolution: "react-native-executorch@npm:0.3.0"
diff --git a/examples/llama/package.json b/examples/llama/package.json
index 6abbb086..022d7afb 100644
--- a/examples/llama/package.json
+++ b/examples/llama/package.json
@@ -16,6 +16,7 @@
     "metro-config": "^0.81.0",
     "react": "18.3.1",
     "react-native": "0.76.3",
+    "react-native-audio-api": "^0.4.13",
     "react-native-executorch": "^0.3.0",
     "react-native-loading-spinner-overlay": "^3.0.1",
     "react-native-markdown-display": "^7.0.2",
diff --git a/examples/llama/yarn.lock b/examples/llama/yarn.lock
index 4b6a4516..1059fc38 100644
--- a/examples/llama/yarn.lock
+++ b/examples/llama/yarn.lock
@@ -6074,6 +6074,7 @@ __metadata:
     metro-config: ^0.81.0
     react: 18.3.1
     react-native: 0.76.3
+    react-native-audio-api: ^0.4.13
     react-native-executorch: ^0.3.0
     react-native-loading-spinner-overlay: ^3.0.1
     react-native-markdown-display: ^7.0.2
@@ -7555,6 +7556,18 @@
   languageName: node
   linkType: hard
 
+"react-native-audio-api@npm:^0.4.13":
+  version: 0.4.13
+  resolution: "react-native-audio-api@npm:0.4.13"
+  peerDependencies:
+    react: "*"
+    react-native: "*"
+  bin:
+    setup-rn-audio-api-web: scripts/setup-rn-audio-api-web.js
+  checksum: 9a5db4626a0663224cdcdc957ed5176df9ed65df9f58bf06c5eae2a91895b7ee0f0a632ecaa79b23ef7536983485cea2720ca46201e2e690548f400b66970ff6
+  languageName: node
+  linkType: hard
+
 "react-native-executorch@npm:^0.3.0":
   version: 0.3.0
   resolution: "react-native-executorch@npm:0.3.0"