-
Notifications
You must be signed in to change notification settings - Fork 16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: implemented vertical ocr #109
Merged
Merged
Changes from all commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
76b47f1
feat: implemented vertical ocr
NorbertKlockiewicz 4c118a0
refactor: refactor of vertical ocr code
NorbertKlockiewicz 75033ca
feat: added urls to models on hf repo
NorbertKlockiewicz 706e753
refactor: implement requested changes
NorbertKlockiewicz e04945d
feat: added function to calculate download progress with multiple mod…
NorbertKlockiewicz 7bc527c
feat: added controllers for ocrs to avoid duplicated code
NorbertKlockiewicz dd6fd56
feat: added thresholding to single character processing
NorbertKlockiewicz 68b6aae
feat: improved pipeline for single character processing
NorbertKlockiewicz 70e9139
feat: create constants for magic numbers(ios)
NorbertKlockiewicz b8514b2
feat: add const for min size magic number(android)
NorbertKlockiewicz 3f6a13a
fix: set progress to 1 after every file is downloaded
NorbertKlockiewicz 42aa0f2
fix: suggested changes
NorbertKlockiewicz File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
173 changes: 173 additions & 0 deletions
173
android/src/main/java/com/swmansion/rnexecutorch/VerticalOCR.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,173 @@ | ||
package com.swmansion.rnexecutorch | ||
|
||
import android.util.Log | ||
import com.facebook.react.bridge.Arguments | ||
import com.facebook.react.bridge.Promise | ||
import com.facebook.react.bridge.ReactApplicationContext | ||
import com.swmansion.rnexecutorch.utils.ETError | ||
import com.swmansion.rnexecutorch.utils.ImageProcessor | ||
import org.opencv.android.OpenCVLoader | ||
import com.swmansion.rnexecutorch.models.ocr.Recognizer | ||
import com.swmansion.rnexecutorch.models.ocr.VerticalDetector | ||
import com.swmansion.rnexecutorch.models.ocr.utils.CTCLabelConverter | ||
import com.swmansion.rnexecutorch.models.ocr.utils.Constants | ||
import com.swmansion.rnexecutorch.models.ocr.utils.RecognizerUtils | ||
import org.opencv.core.Core | ||
import org.opencv.core.Mat | ||
|
||
/**
 * React Native module exposing a two-stage OCR pipeline under the name [NAME].
 *
 * A "large" [VerticalDetector] finds text regions in the whole image, a "narrow"
 * [VerticalDetector] finds character boxes inside each region, and a [Recognizer]
 * plus [CTCLabelConverter] turn the cropped characters into text.
 *
 * [loadModule] must complete successfully before [forward] is called; the
 * `lateinit` models are only assigned there.
 */
class VerticalOCR(reactContext: ReactApplicationContext) :
  NativeVerticalOCRSpec(reactContext) {

  private lateinit var detectorLarge: VerticalDetector
  private lateinit var detectorNarrow: VerticalDetector
  private lateinit var recognizer: Recognizer
  private lateinit var converter: CTCLabelConverter

  // When true every detected character is recognized on its own; otherwise the
  // character crops of a region are concatenated and recognized in one pass.
  private var independentCharacters = true

  companion object {
    const val NAME = "VerticalOCR"
  }

  init {
    // Only the outcome is logged; a failed OpenCV init is not treated as fatal here.
    if (OpenCVLoader.initLocal()) {
      Log.d("rn_executorch", "OpenCV loaded")
    } else {
      Log.d("rn_executorch", "OpenCV not loaded")
    }
  }

  /**
   * Loads both detectors and the recognizer and builds the label converter.
   *
   * @param detectorLargeSource source passed to the large detector's `loadModel`
   * @param detectorNarrowSource source passed to the narrow detector's `loadModel`
   * @param recognizerSource source passed to the recognizer's `loadModel`
   * @param symbols symbol set handed to [CTCLabelConverter]
   * @param independentCharacters recognition mode used by subsequent [forward] calls
   * @param promise resolved with `0` on success; rejected with the exception text as
   *   code and `ETError.InvalidModelSource` as message on failure
   */
  override fun loadModule(
    detectorLargeSource: String,
    detectorNarrowSource: String,
    recognizerSource: String,
    symbols: String,
    independentCharacters: Boolean,
    promise: Promise
  ) {
    try {
      this.independentCharacters = independentCharacters

      // NOTE(review): the boolean flag presumably selects the narrow variant — confirm
      // against VerticalDetector.
      detectorLarge = VerticalDetector(false, reactApplicationContext).also {
        it.loadModel(detectorLargeSource)
      }
      detectorNarrow = VerticalDetector(true, reactApplicationContext).also {
        it.loadModel(detectorNarrowSource)
      }
      recognizer = Recognizer(reactApplicationContext).also {
        it.loadModel(recognizerSource)
      }
      converter = CTCLabelConverter(symbols)

      promise.resolve(0)
    } catch (e: Exception) {
      // Exceptions may carry no message; never let `!!` throw inside the handler,
      // otherwise the promise would stay unsettled.
      promise.reject(e.message ?: e.toString(), ETError.InvalidModelSource.toString())
    }
  }

  /**
   * Runs detection and recognition on the image identified by [input].
   *
   * Resolves [promise] with an array of maps, one per region found by the large
   * detector, each holding `text`, `bbox` and `confidence`. A region in which the
   * narrow detector finds no characters is reported with empty text and a
   * confidence of `0.0`. Any exception rejects the promise with its text.
   */
  override fun forward(input: String, promise: Promise) {
    try {
      // Snapshot the mode so one run is consistent even if loadModule is called meanwhile.
      val recognizeIndependently = independentCharacters

      val inputImage = ImageProcessor.readImage(input)
      val result = detectorLarge.runModel(inputImage)
      val largeDetectorSize = detectorLarge.getModelImageSize()
      val resizedImage = ImageProcessor.resizeWithPadding(
        inputImage,
        largeDetectorSize.width.toInt(),
        largeDetectorSize.height.toInt()
      )

      // Depends only on the input image and the large detector size, so compute it
      // once instead of once per region.
      // NOTE(review): assumes the returned map is not mutated by the helpers below.
      val paddings = RecognizerUtils.calculateResizeRatioAndPaddings(
        inputImage.width(),
        inputImage.height(),
        largeDetectorSize.width.toInt(),
        largeDetectorSize.height.toInt()
      )
      val paddingLeft = paddings["left"] as Int
      val paddingTop = paddings["top"] as Int
      val resizeRatio = paddings["resizeRatio"] as Float

      val predictions = Arguments.createArray()
      for (box in result) {
        val cords = box.bBox
        val boxWidth = cords[2].x - cords[0].x
        val boxHeight = cords[2].y - cords[0].y

        val boundingBox = RecognizerUtils.extractBoundingBox(cords)
        val croppedImage = Mat(resizedImage, boundingBox)

        var text = ""
        var confidenceScore = 0.0
        val boxResult = detectorNarrow.runModel(croppedImage)
        val narrowDetectorSize = detectorNarrow.getModelImageSize()

        // Without this guard an empty detection divides 0.0 by 0 (NaN confidence) in
        // independent mode and makes Core.hconcat throw on an empty list otherwise.
        if (boxResult.isNotEmpty()) {
          // Identical for every character of this region: hoisted out of the inner loop.
          val paddingsBox = RecognizerUtils.calculateResizeRatioAndPaddings(
            boxWidth.toInt(),
            boxHeight.toInt(),
            narrowDetectorSize.width.toInt(),
            narrowDetectorSize.height.toInt()
          )
          val croppedCharacters = mutableListOf<Mat>()

          for (characterBox in boxResult) {
            val characterImage = RecognizerUtils.cropImageWithBoundingBox(
              inputImage,
              characterBox.bBox,
              cords,
              paddingsBox,
              paddings
            )

            if (recognizeIndependently) {
              val singleCharacter = RecognizerUtils.normalizeForRecognizer(
                RecognizerUtils.cropSingleCharacter(characterImage),
                0.0,
                true
              )
              val recognitionResult = recognizer.runModel(singleCharacter)
              val predIndex = recognitionResult.first
              val decodedText = converter.decodeGreedy(predIndex, predIndex.size)
              text += decodedText[0]
              confidenceScore += recognitionResult.second
            } else {
              croppedCharacters.add(characterImage)
            }
          }

          if (recognizeIndependently) {
            // Report the mean confidence over the region's characters.
            confidenceScore /= boxResult.size
          } else {
            // Join the character crops side by side and recognize them in one pass.
            val concatenated = Mat()
            Core.hconcat(croppedCharacters, concatenated)
            val recognizerInput = RecognizerUtils.normalizeForRecognizer(
              ImageProcessor.resizeWithPadding(
                concatenated,
                Constants.LARGE_MODEL_WIDTH,
                Constants.MODEL_HEIGHT
              ),
              0.0
            )

            val recognitionResult = recognizer.runModel(recognizerInput)
            val predIndex = recognitionResult.first
            val decodedText = converter.decodeGreedy(predIndex, predIndex.size)

            text = decodedText[0]
            confidenceScore = recognitionResult.second
          }
        }

        // Undo the padding offset and scale of the detector input on the region's
        // corner points (mutated in place before serialization).
        // NOTE(review): presumably maps them back to original-image coordinates — confirm.
        for (point in box.bBox) {
          point.x = (point.x - paddingLeft) * resizeRatio
          point.y = (point.y - paddingTop) * resizeRatio
        }

        val resMap = Arguments.createMap()
        resMap.putString("text", text)
        resMap.putArray("bbox", box.toWritableArray())
        resMap.putDouble("confidence", confidenceScore)

        predictions.pushMap(resMap)
      }

      promise.resolve(predictions)
    } catch (e: Exception) {
      Log.d("rn_executorch", "Error running model: ${e.message}")
      // Fall back to the exception's string form when it has no message, so the
      // rejection itself can never throw.
      val reason = e.message ?: e.toString()
      promise.reject(reason, reason)
    }
  }

  /** Returns the name under which this module is registered, [NAME]. */
  override fun getName(): String = NAME
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would move some of this logic into something like the recognitionHandler used in the horizontal OCR, to make it more maintainable.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was thinking about it; however, the whole OCR already consists of many files, and I felt that adding another file that would only be a wrapper wouldn't help that much.