From c6f141f55c7a0659c833168dc13da70d2b70ea9c Mon Sep 17 00:00:00 2001 From: jakmro Date: Mon, 17 Feb 2025 15:39:03 +0100 Subject: [PATCH 1/8] Add downloadProgress field documentation --- docs/docs/computer-vision/useClassification.md | 13 +++++++------ docs/docs/computer-vision/useObjectDetection.md | 13 +++++++------ docs/docs/computer-vision/useStyleTransfer.md | 13 +++++++------ docs/docs/module-api/executorch-bindings.md | 17 +++++++++-------- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/docs/docs/computer-vision/useClassification.md b/docs/docs/computer-vision/useClassification.md index db33fed1..98945be6 100644 --- a/docs/docs/computer-vision/useClassification.md +++ b/docs/docs/computer-vision/useClassification.md @@ -38,12 +38,13 @@ A string that specifies the location of the model binary. For more information, ### Returns -| Field | Type | Description | -| -------------- | ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------- | -| `forward` | `(input: string) => Promise<{ [category: string]: number }>` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | -| `error` | string | null | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| Field | Type | Description | +| ------------------ | ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------- | +| `forward` | `(input: string) => Promise<{ [category: string]: number }>` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | +| `error` | string | null | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1 | ## Running the model diff --git a/docs/docs/computer-vision/useObjectDetection.md b/docs/docs/computer-vision/useObjectDetection.md index a0e30337..dae1aeee 100644 --- a/docs/docs/computer-vision/useObjectDetection.md +++ b/docs/docs/computer-vision/useObjectDetection.md @@ -61,12 +61,13 @@ For more information on that topic, you can check out the [Loading models](https The hook returns an object with the following properties: -| Field | Type | Description | -| -------------- | ----------------------------------------- | ---------------------------------------------------------------------------------------- | -| `forward` | `(input: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `Detection` objects. | -| `error` | string | null | Contains the error message if the model loading failed. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. 
| +| Field | Type | Description | +| ------------------ | ----------------------------------------- | ---------------------------------------------------------------------------------------- | +| `forward` | `(input: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `Detection` objects. | +| `error` | string | null | Contains the error message if the model loading failed. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1 | ## Running the model diff --git a/docs/docs/computer-vision/useStyleTransfer.md b/docs/docs/computer-vision/useStyleTransfer.md index 6a8a3461..5b24df13 100644 --- a/docs/docs/computer-vision/useStyleTransfer.md +++ b/docs/docs/computer-vision/useStyleTransfer.md @@ -37,12 +37,13 @@ A string that specifies the location of the model binary. For more information, ### Returns -| Field | Type | Description | -| -------------- | ------------------------------------ | -------------------------------------------------------------------------------------------------------- | -| `forward` | `(input: string) => Promise` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | -| `error` | string | null | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| Field | Type | Description | +| ------------------ | ------------------------------------ | -------------------------------------------------------------------------------------------------------- | +| `forward` | `(input: string) => Promise` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | +| `error` | string | null | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1 | ## Running the model diff --git a/docs/docs/module-api/executorch-bindings.md b/docs/docs/module-api/executorch-bindings.md index 2cfbfa5c..7e82208c 100644 --- a/docs/docs/module-api/executorch-bindings.md +++ b/docs/docs/module-api/executorch-bindings.md @@ -29,14 +29,15 @@ The `modelSource` parameter expects a location string pointing to the model bina ### Returns -| Field | Type | Description | -| -------------- | ---------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `error` | string | null | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. 
| -| `loadMethod` | `(methodName: string) => Promise` | Loads resources specific to `methodName` into memory before execution. | -| `loadForward` | `() => Promise` | Loads resources specific to `forward` method into memory before execution. Uses `loadMethod` under the hood. | -| `forward` | `(input: ETInput, shape: number[]) => Promise` | Executes the model's forward pass, where `input` is a Javascript typed array and `shape` is an array of integers representing input Tensor shape. The output is a Tensor - raw result of inference. | +| Field | Type | Description | +| ------------------ | ---------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `error` | string | null | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `loadMethod` | `(methodName: string) => Promise` | Loads resources specific to `methodName` into memory before execution. | +| `loadForward` | `() => Promise` | Loads resources specific to `forward` method into memory before execution. Uses `loadMethod` under the hood. | +| `forward` | `(input: ETInput, shape: number[]) => Promise` | Executes the model's forward pass, where `input` is a Javascript typed array and `shape` is an array of integers representing input Tensor shape. The output is a Tensor - raw result of inference. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1 | ## ETInput From c17302bb042dcae70d36cb73834b727560828677 Mon Sep 17 00:00:00 2001 From: jakmro Date: Tue, 18 Feb 2025 15:01:59 +0100 Subject: [PATCH 2/8] Revert "Add downloadProgress field documentation" This reverts commit c6f141f55c7a0659c833168dc13da70d2b70ea9c. --- docs/docs/computer-vision/useClassification.md | 13 ++++++------- docs/docs/computer-vision/useObjectDetection.md | 13 ++++++------- docs/docs/computer-vision/useStyleTransfer.md | 13 ++++++------- docs/docs/module-api/executorch-bindings.md | 17 ++++++++--------- 4 files changed, 26 insertions(+), 30 deletions(-) diff --git a/docs/docs/computer-vision/useClassification.md b/docs/docs/computer-vision/useClassification.md index 98945be6..db33fed1 100644 --- a/docs/docs/computer-vision/useClassification.md +++ b/docs/docs/computer-vision/useClassification.md @@ -38,13 +38,12 @@ A string that specifies the location of the model binary. For more information, ### Returns -| Field | Type | Description | -| ------------------ | ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------- | -| `forward` | `(input: string) => Promise<{ [category: string]: number }>` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | -| `error` | string | null | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. 
| -| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1 | +| Field | Type | Description | +| -------------- | ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------- | +| `forward` | `(input: string) => Promise<{ [category: string]: number }>` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | +| `error` | string | null | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | ## Running the model diff --git a/docs/docs/computer-vision/useObjectDetection.md b/docs/docs/computer-vision/useObjectDetection.md index dae1aeee..a0e30337 100644 --- a/docs/docs/computer-vision/useObjectDetection.md +++ b/docs/docs/computer-vision/useObjectDetection.md @@ -61,13 +61,12 @@ For more information on that topic, you can check out the [Loading models](https The hook returns an object with the following properties: -| Field | Type | Description | -| ------------------ | ----------------------------------------- | ---------------------------------------------------------------------------------------- | -| `forward` | `(input: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `Detection` objects. | -| `error` | string | null | Contains the error message if the model loading failed. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1 | +| Field | Type | Description | +| -------------- | ----------------------------------------- | ---------------------------------------------------------------------------------------- | +| `forward` | `(input: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `Detection` objects. | +| `error` | string | null | Contains the error message if the model loading failed. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | ## Running the model diff --git a/docs/docs/computer-vision/useStyleTransfer.md b/docs/docs/computer-vision/useStyleTransfer.md index 5b24df13..6a8a3461 100644 --- a/docs/docs/computer-vision/useStyleTransfer.md +++ b/docs/docs/computer-vision/useStyleTransfer.md @@ -37,13 +37,12 @@ A string that specifies the location of the model binary. For more information, ### Returns -| Field | Type | Description | -| ------------------ | ------------------------------------ | -------------------------------------------------------------------------------------------------------- | -| `forward` | `(input: string) => Promise` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | -| `error` | string | null | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. 
| -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1 | +| Field | Type | Description | +| -------------- | ------------------------------------ | -------------------------------------------------------------------------------------------------------- | +| `forward` | `(input: string) => Promise` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | +| `error` | string | null | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | ## Running the model diff --git a/docs/docs/module-api/executorch-bindings.md b/docs/docs/module-api/executorch-bindings.md index 7e82208c..2cfbfa5c 100644 --- a/docs/docs/module-api/executorch-bindings.md +++ b/docs/docs/module-api/executorch-bindings.md @@ -29,15 +29,14 @@ The `modelSource` parameter expects a location string pointing to the model bina ### Returns -| Field | Type | Description | -| ------------------ | ---------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `error` | string | null | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `loadMethod` | `(methodName: string) => Promise` | Loads resources specific to `methodName` into memory before execution. | -| `loadForward` | `() => Promise` | Loads resources specific to `forward` method into memory before execution. Uses `loadMethod` under the hood. | -| `forward` | `(input: ETInput, shape: number[]) => Promise` | Executes the model's forward pass, where `input` is a Javascript typed array and `shape` is an array of integers representing input Tensor shape. The output is a Tensor - raw result of inference. | -| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1 | +| Field | Type | Description | +| -------------- | ---------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `error` | string | null | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `loadMethod` | `(methodName: string) => Promise` | Loads resources specific to `methodName` into memory before execution. | +| `loadForward` | `() => Promise` | Loads resources specific to `forward` method into memory before execution. Uses `loadMethod` under the hood. 
| +| `forward` | `(input: ETInput, shape: number[]) => Promise` | Executes the model's forward pass, where `input` is a Javascript typed array and `shape` is an array of integers representing input Tensor shape. The output is a Tensor - raw result of inference. | ## ETInput From 7cc6ebef4a906be51e1f80a45d9ac5c1b5b6ee71 Mon Sep 17 00:00:00 2001 From: Jakub Mroz <115979017+jakmro@users.noreply.github.com> Date: Tue, 18 Feb 2025 15:10:54 +0100 Subject: [PATCH 3/8] docs: Hookless-API (#90) ## Description Add documentation for hookless-API ### Type of change - [ ] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) - [x] Documentation update (improves or adds clarity to existing documentation) ### Checklist - [x] I have performed a self-review of my code - [x] I have commented my code, particularly in hard-to-understand areas - [x] I have updated the documentation accordingly - [x] My changes generate no new warnings --------- Co-authored-by: Jakub Chmura <92989966+chmjkb@users.noreply.github.com> --- .../docs/hookless-api/ClassificationModule.md | 48 ++++++++++ docs/docs/hookless-api/ExecutorchModule.md | 71 +++++++++++++++ docs/docs/hookless-api/LLMModule.md | 89 +++++++++++++++++++ .../hookless-api/ObjectDetectionModule.md | 61 +++++++++++++ docs/docs/hookless-api/StyleTransferModule.md | 48 ++++++++++ docs/docs/hookless-api/_category_.json | 7 ++ 6 files changed, 324 insertions(+) create mode 100644 docs/docs/hookless-api/ClassificationModule.md create mode 100644 docs/docs/hookless-api/ExecutorchModule.md create mode 100644 docs/docs/hookless-api/LLMModule.md create mode 100644 docs/docs/hookless-api/ObjectDetectionModule.md create mode 100644 docs/docs/hookless-api/StyleTransferModule.md create mode 100644 docs/docs/hookless-api/_category_.json diff --git a/docs/docs/hookless-api/ClassificationModule.md b/docs/docs/hookless-api/ClassificationModule.md new file mode 100644 index 00000000..732971db --- /dev/null +++ b/docs/docs/hookless-api/ClassificationModule.md @@ -0,0 +1,48 @@ +--- +title: ClassificationModule +sidebar_position: 1 +--- + +Hookless implementation of the [useClassification](../computer-vision/useClassification.mdx) hook. + +## Reference + +```typescript +import { + ClassificationModule, + EFFICIENTNET_V2_S, +} from 'react-native-executorch'; + +const imageUri = 'path/to/image.png'; + +// Loading the model +await ClassificationModule.load(EFFICIENTNET_V2_S); + +// Running the model +const classesWithProbabilities = await ClassificationModule.forward(imageUri); +``` + +### Methods + +| Method | Type | Description | +| -------------------- | ---------------------------------------------------------- | -------------------------------------------------------------------------------------------------------- | +| `load` | `(modelSource: ResourceSource): Promise` | Loads the model, where `modelSource` is a string that specifies the location of the model binary. | +| `forward` | `(input: string): Promise<{ [category: string]: number }>` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | +| `onDownloadProgress` | `(callback: (downloadProgress: number) => void): any` | Subscribe to the download progress event. | + +
+Type definitions + +```typescript +type ResourceSource = string | number; +``` + +
+ +## Loading the model + +To load the model, use the `load` method. It accepts the `modelSource` which is a string that specifies the location of the model binary. For more information, take a look at [loading models](../fundamentals/loading-models.md) page. This method returns a promise, which can resolve to an error or void. + +## Running the model + +To run the model, you can use the `forward` method. It accepts one argument, which is the image. The image can be a remote URL, a local file URI, or a base64-encoded image. The method returns a promise, which can resolve either to an error or an object containing categories with their probabilities. diff --git a/docs/docs/hookless-api/ExecutorchModule.md b/docs/docs/hookless-api/ExecutorchModule.md new file mode 100644 index 00000000..db3c3cd7 --- /dev/null +++ b/docs/docs/hookless-api/ExecutorchModule.md @@ -0,0 +1,71 @@ +--- +title: ExecuTorchModule +sidebar_position: 5 +--- + +Hookless implementation of the [useExecutorchModule](../module-api/executorch-bindings.md) hook. + +## Reference + +```typescript +import { + ExecutorchModule, + STYLE_TRANSFER_CANDY, +} from 'react-native-executorch'; + +// Creating the input array +const shape = [1, 3, 640, 640]; +const input = new Float32Array(1 * 3 * 640 * 640); + +// Loading the model +await ExecutorchModule.load(STYLE_TRANSFER_CANDY); + +// Running the model +const output = await ExecutorchModule.forward(input, shape); +``` + +### Methods + +| Method | Type | Description | +| -------------------- | ------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `load` | `(modelSource: ResourceSource): Promise` | Loads the model, where `modelSource` is a string that specifies the location of the model binary. | +| `forward` | `(input: ETInput, shape: number[]): Promise` | Executes the model's forward pass, where `input` is a JavaScript typed array and `shape` is an array of integers representing input Tensor shape. The output is a Tensor - raw result of inference. | +| `loadMethod` | `(methodName: string): Promise` | Loads resources specific to `methodName` into memory before execution. | +| `loadForward` | `(): Promise` | Loads resources specific to `forward` method into memory before execution. Uses `loadMethod` under the hood. | +| `onDownloadProgress` | `(callback: (downloadProgress: number) => void): any` | Subscribe to the download progress event. | + +
+Type definitions + +```typescript +type ResourceSource = string | number; + +export type ETInput = + | Int8Array + | Int32Array + | BigInt64Array + | Float32Array + | Float64Array; +``` + +
+</details>
+
+## Loading the model
+
+To load the model, use the `load` method. It accepts the `modelSource`, which is a string that specifies the location of the model binary. For more information, take a look at the [loading models](../fundamentals/loading-models.md) page. This method returns a promise, which can resolve to an error or void.
+
+## Running the model
+
+To run the model, use the `forward` method. It accepts two arguments: `input` and `shape`. The `input` is a JavaScript typed array, and `shape` is an array of integers representing the input tensor shape. There's no need to explicitly define the input type, as it will automatically be inferred from the typed array you pass to the `forward` method. Outputs from the model, such as classification probabilities, are returned in raw format.
+
+## Loading methods
+
+Use the `loadMethod` method to load resources specific to `methodName` into memory before execution.
+
+## Loading forward
+
+Use the `loadForward` method to load resources specific to the `forward` method into memory before execution. It uses `loadMethod` under the hood.
+
+:::info
+This code assumes that you have handled preprocessing of the input image (scaling, normalization) and postprocessing of the output (interpreting the raw output data) according to the model's requirements. Make sure to adjust these parts depending on your specific data and model outputs.
+:::
diff --git a/docs/docs/hookless-api/LLMModule.md b/docs/docs/hookless-api/LLMModule.md
new file mode 100644
index 00000000..005a2f7c
--- /dev/null
+++ b/docs/docs/hookless-api/LLMModule.md
@@ -0,0 +1,89 @@
+---
+title: LLMModule
+sidebar_position: 4
+---
+
+Hookless implementation of the [useLLM](../llms/running-llms.md) hook.
+
+## Reference
+
+```typescript
+import {
+  LLMModule,
+  LLAMA3_2_1B_QLORA,
+  LLAMA3_2_1B_TOKENIZER,
+} from 'react-native-executorch';
+
+// Listening for download progress
+LLMModule.onDownloadProgress((progress) => {
+  console.log(progress);
+});
+
+// Loading the model
+await LLMModule.load(LLAMA3_2_1B_QLORA, LLAMA3_2_1B_TOKENIZER);
+
+// Listening for token
+LLMModule.onToken((token) => {
+  console.log(token);
+});
+
+// Running the model
+LLMModule.generate('Hello, World!');
+
+// Interrupting the model
+LLMModule.interrupt();
+
+// Deleting the model from memory
+LLMModule.delete();
+```
+
+### Methods
+
+| Method               | Type                                                                                                                                   | Description                                                                                 |
+| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- |
+| `load`               | `(modelSource: ResourceSource, tokenizerSource: ResourceSource, systemPrompt?: string, contextWindowLength?: number): Promise`          | Loads the model. Checkout the [loading the model](#loading-the-model) section for details.  |
+| `onDownloadProgress` | `(callback: (downloadProgress: number) => void): any`                                                                                    | Subscribe to the download progress event.                                                   |
+| `generate`           | `(input: string): Promise`                                                                                                               | Method to start generating a response with the given input string.                          |
+| `onToken`            | (callback: (data: string | undefined) => void): any                                                                                      | Subscribe to the token generation event.                                                    |
+| `interrupt`          | `(): void`                                                                                                                               | Method to interrupt the current inference                                                   |
+| `delete`             | `(): void`                                                                                                                               | Method to delete the model from memory.                                                     |
+
+<details>
+Type definitions + +```typescript +type ResourceSource = string | number; +``` + +
+ +## Loading the model + +To load the model, use the `load` method. It accepts: + +- `modelSource` - A string that specifies the location of the model binary. For more information, take a look at [loading models](../fundamentals/loading-models.md) page. +- `tokenizerSource` - URL to the binary file which contains the tokenizer +- `systemPrompt` - Often used to tell the model what is its purpose, for example - "Be a helpful translator" +- `contextWindowLength` - The number of messages from the current conversation that the model will use to generate a response. The higher the number, the more context the model will have. Keep in mind that using larger context windows will result in longer inference time and higher memory usage. + +This method returns a promise, which can resolve to an error or void. + +## Listening for download progress + +To subscribe to the download progress event, you can use the `onDownloadProgress` method. It accepts a callback function that will be called whenever the download progress changes. + +## Running the model + +To run the model, you can use the `generate` method. It accepts one argument, which is the input string. The method returns a promise, which can resolve to an error or void. + +## Listening for token + +To subscribe to the token event, you can use the `onToken` method. It accepts a callback function that will be called whenever a token is generated. + +## Interrupting the model + +In order to interrupt the model, you can use the `interrupt` method. + +## Deleting the model from memory + +To delete the model from memory, you can use the `delete` method. diff --git a/docs/docs/hookless-api/ObjectDetectionModule.md b/docs/docs/hookless-api/ObjectDetectionModule.md new file mode 100644 index 00000000..eaaf644d --- /dev/null +++ b/docs/docs/hookless-api/ObjectDetectionModule.md @@ -0,0 +1,61 @@ +--- +title: ObjectDetectionModule +sidebar_position: 2 +--- + +Hookless implementation of the [useObjectDetection](../computer-vision/useObjectDetection.mdx) hook. + +## Reference + +```typescript +import { + ObjectDetectionModule, + SSDLITE_320_MOBILENET_V3_LARGE, +} from 'react-native-executorch'; + +const imageUri = 'path/to/image.png'; + +// Loading the model +await ObjectDetectionModule.load(SSDLITE_320_MOBILENET_V3_LARGE); + +// Running the model +const detections = await ObjectDetectionModule.forward(imageUri); +``` + +### Methods + +| Method | Type | Description | +| -------------------- | ----------------------------------------------------- | -------------------------------------------------------------------------------------------------------- | +| `load` | `(modelSource: ResourceSource): Promise` | Loads the model, where `modelSource` is a string that specifies the location of the model binary. | +| `forward` | `(input: string): Promise` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | +| `onDownloadProgress` | `(callback: (downloadProgress: number) => void): any` | Subscribe to the download progress event. | + +
+Type definitions + +```typescript +type ResourceSource = string | number; + +interface Bbox { + x1: number; + x2: number; + y1: number; + y2: number; +} + +interface Detection { + bbox: Bbox; + label: keyof typeof CocoLabel; + score: number; +} +``` + +
+ +## Loading the model + +To load the model, use the `load` method. It accepts the `modelSource` which is a string that specifies the location of the model binary. For more information, take a look at [loading models](../fundamentals/loading-models.md) page. This method returns a promise, which can resolve to an error or void. + +## Running the model + +To run the model, you can use the `forward` method. It accepts one argument, which is the image. The image can be a remote URL, a local file URI, or a base64-encoded image. The method returns a promise, which can resolve either to an error or an array of `Detection` objects. Each object contains coordinates of the bounding box, the label of the detected object, and the confidence score. diff --git a/docs/docs/hookless-api/StyleTransferModule.md b/docs/docs/hookless-api/StyleTransferModule.md new file mode 100644 index 00000000..d1d22023 --- /dev/null +++ b/docs/docs/hookless-api/StyleTransferModule.md @@ -0,0 +1,48 @@ +--- +title: StyleTransferModule +sidebar_position: 3 +--- + +Hookless implementation of the [useStyleTransfer](../computer-vision/useStyleTransfer.mdx) hook. + +## Reference + +```typescript +import { + StyleTransferModule, + STYLE_TRANSFER_CANDY, +} from 'react-native-executorch'; + +const imageUri = 'path/to/image.png'; + +// Loading the model +await StyleTransferModule.load(STYLE_TRANSFER_CANDY); + +// Running the model +const generatedImageUrl = await StyleTransferModule.forward(imageUri); +``` + +### Methods + +| Method | Type | Description | +| -------------------- | ----------------------------------------------------- | -------------------------------------------------------------------------------------------------------- | +| `load` | `(modelSource: ResourceSource): Promise` | Loads the model, where `modelSource` is a string that specifies the location of the model binary. | +| `forward` | `(input: string): Promise` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | +| `onDownloadProgress` | `(callback: (downloadProgress: number) => void): any` | Subscribe to the download progress event. | + +
+Type definitions + +```typescript +type ResourceSource = string | number; +``` + +
+ +## Loading the model + +To load the model, use the `load` method. It accepts the `modelSource` which is a string that specifies the location of the model binary. For more information, take a look at [loading models](../fundamentals/loading-models.md) page. This method returns a promise, which can resolve to an error or void. + +## Running the model + +To run the model, you can use the `forward` method. It accepts one argument, which is the image. The image can be a remote URL, a local file URI, or a base64-encoded image. The method returns a promise, which can resolve either to an error or a URL to generated image. diff --git a/docs/docs/hookless-api/_category_.json b/docs/docs/hookless-api/_category_.json new file mode 100644 index 00000000..6c0a8908 --- /dev/null +++ b/docs/docs/hookless-api/_category_.json @@ -0,0 +1,7 @@ +{ + "label": "Hookless API", + "position": 5, + "link": { + "type": "generated-index" + } +} From af9f7877975820e57c071b93dd4f09852f689398 Mon Sep 17 00:00:00 2001 From: jakmro Date: Tue, 18 Feb 2025 15:27:08 +0100 Subject: [PATCH 4/8] Add downloadProgress field --- docs/docs/computer-vision/useClassification.md | 13 +++++++------ docs/docs/computer-vision/useObjectDetection.md | 13 +++++++------ docs/docs/computer-vision/useStyleTransfer.md | 13 +++++++------ docs/docs/module-api/executorch-bindings.md | 17 +++++++++-------- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/docs/docs/computer-vision/useClassification.md b/docs/docs/computer-vision/useClassification.md index db33fed1..cd4a650b 100644 --- a/docs/docs/computer-vision/useClassification.md +++ b/docs/docs/computer-vision/useClassification.md @@ -38,12 +38,13 @@ A string that specifies the location of the model binary. For more information, ### Returns -| Field | Type | Description | -| -------------- | ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------- | -| `forward` | `(input: string) => Promise<{ [category: string]: number }>` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | -| `error` | string | null | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| Field | Type | Description | +| ------------------ | ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------- | +| `forward` | `(input: string) => Promise<{ [category: string]: number }>` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | +| `error` | string | null | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. 
| ## Running the model diff --git a/docs/docs/computer-vision/useObjectDetection.md b/docs/docs/computer-vision/useObjectDetection.md index a0e30337..dfda0718 100644 --- a/docs/docs/computer-vision/useObjectDetection.md +++ b/docs/docs/computer-vision/useObjectDetection.md @@ -61,12 +61,13 @@ For more information on that topic, you can check out the [Loading models](https The hook returns an object with the following properties: -| Field | Type | Description | -| -------------- | ----------------------------------------- | ---------------------------------------------------------------------------------------- | -| `forward` | `(input: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `Detection` objects. | -| `error` | string | null | Contains the error message if the model loading failed. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| Field | Type | Description | +| ------------------ | ----------------------------------------- | ---------------------------------------------------------------------------------------- | +| `forward` | `(input: string) => Promise` | A function that accepts an image (url, b64) and returns an array of `Detection` objects. | +| `error` | string | null | Contains the error message if the model loading failed. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. | ## Running the model diff --git a/docs/docs/computer-vision/useStyleTransfer.md b/docs/docs/computer-vision/useStyleTransfer.md index 6a8a3461..f3a8a1b1 100644 --- a/docs/docs/computer-vision/useStyleTransfer.md +++ b/docs/docs/computer-vision/useStyleTransfer.md @@ -37,12 +37,13 @@ A string that specifies the location of the model binary. For more information, ### Returns -| Field | Type | Description | -| -------------- | ------------------------------------ | -------------------------------------------------------------------------------------------------------- | -| `forward` | `(input: string) => Promise` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | -| `error` | string | null | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| Field | Type | Description | +| ------------------ | ------------------------------------ | -------------------------------------------------------------------------------------------------------- | +| `forward` | `(input: string) => Promise` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | +| `error` | string | null | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. 
| ## Running the model diff --git a/docs/docs/module-api/executorch-bindings.md b/docs/docs/module-api/executorch-bindings.md index 2cfbfa5c..282beaf5 100644 --- a/docs/docs/module-api/executorch-bindings.md +++ b/docs/docs/module-api/executorch-bindings.md @@ -29,14 +29,15 @@ The `modelSource` parameter expects a location string pointing to the model bina ### Returns -| Field | Type | Description | -| -------------- | ---------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `error` | string | null | Contains the error message if the model failed to load. | -| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | -| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | -| `loadMethod` | `(methodName: string) => Promise` | Loads resources specific to `methodName` into memory before execution. | -| `loadForward` | `() => Promise` | Loads resources specific to `forward` method into memory before execution. Uses `loadMethod` under the hood. | -| `forward` | `(input: ETInput, shape: number[]) => Promise` | Executes the model's forward pass, where `input` is a Javascript typed array and `shape` is an array of integers representing input Tensor shape. The output is a Tensor - raw result of inference. | +| Field | Type | Description | +| ------------------ | ---------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `error` | string | null | Contains the error message if the model failed to load. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | +| `loadMethod` | `(methodName: string) => Promise` | Loads resources specific to `methodName` into memory before execution. | +| `loadForward` | `() => Promise` | Loads resources specific to `forward` method into memory before execution. Uses `loadMethod` under the hood. | +| `forward` | `(input: ETInput, shape: number[]) => Promise` | Executes the model's forward pass, where `input` is a Javascript typed array and `shape` is an array of integers representing input Tensor shape. The output is a Tensor - raw result of inference. | +| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. 
| ## ETInput From b85d2e3dbdb27ada9bda9f0acfb841eec7612d36 Mon Sep 17 00:00:00 2001 From: Jakub Mroz <115979017+jakmro@users.noreply.github.com> Date: Fri, 21 Feb 2025 15:21:49 +0100 Subject: [PATCH 5/8] docs: Add listDownloadedFiles and listDownloadedModels documentation (#99) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description Add listDownloadedFiles and listDownloadedModels documentation ### Type of change - [ ] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) - [x] Documentation update (improves or adds clarity to existing documentation) ### Checklist - [x] I have performed a self-review of my code - [x] I have commented my code, particularly in hard-to-understand areas - [x] I have updated the documentation accordingly - [x] My changes generate no new warnings --------- Co-authored-by: Mateusz Kopciński --- docs/docs/benchmarks/_category_.json | 2 +- .../computer-vision/useObjectDetection.md | 2 +- docs/docs/computer-vision/useStyleTransfer.md | 2 +- docs/docs/hookless-api/ExecutorchModule.md | 2 +- docs/docs/hookless-api/LLMModule.md | 2 +- .../hookless-api/ObjectDetectionModule.md | 2 +- docs/docs/hookless-api/StyleTransferModule.md | 2 +- docs/docs/hookless-api/_category_.json | 2 +- docs/docs/module-api/_category_.json | 2 +- docs/docs/utils/_category_.json | 7 +++ docs/docs/utils/list-downloaded-resources.md | 43 +++++++++++++++++++ 11 files changed, 59 insertions(+), 9 deletions(-) create mode 100644 docs/docs/utils/_category_.json create mode 100644 docs/docs/utils/list-downloaded-resources.md diff --git a/docs/docs/benchmarks/_category_.json b/docs/docs/benchmarks/_category_.json index 8e10f7a3..001b3495 100644 --- a/docs/docs/benchmarks/_category_.json +++ b/docs/docs/benchmarks/_category_.json @@ -1,6 +1,6 @@ { "label": "Benchmarks", - "position": 5, + "position": 7, "link": { "type": "generated-index" } diff --git a/docs/docs/computer-vision/useObjectDetection.md b/docs/docs/computer-vision/useObjectDetection.md index dfda0718..0c71958b 100644 --- a/docs/docs/computer-vision/useObjectDetection.md +++ b/docs/docs/computer-vision/useObjectDetection.md @@ -1,6 +1,6 @@ --- title: useObjectDetection -sidebar_position: 2 +sidebar_position: 3 --- Object detection is a computer vision technique that identifies and locates objects within images or video. It’s commonly used in applications like image recognition, video surveillance or autonomous driving. diff --git a/docs/docs/computer-vision/useStyleTransfer.md b/docs/docs/computer-vision/useStyleTransfer.md index f3a8a1b1..f58919e8 100644 --- a/docs/docs/computer-vision/useStyleTransfer.md +++ b/docs/docs/computer-vision/useStyleTransfer.md @@ -1,6 +1,6 @@ --- title: useStyleTransfer -sidebar_position: 3 +sidebar_position: 2 --- Style transfer is a technique used in computer graphics and machine learning where the visual style of one image is applied to the content of another. This is achieved using algorithms that manipulate data from both images, typically with the aid of a neural network. The result is a new image that combines the artistic elements of one picture with the structural details of another, effectively merging art with traditional imagery. React Native ExecuTorch offers a dedicated hook `useStyleTransfer`, for this task. 
However before you start you'll need to obtain ExecuTorch-compatible model binary. diff --git a/docs/docs/hookless-api/ExecutorchModule.md b/docs/docs/hookless-api/ExecutorchModule.md index db3c3cd7..0b870952 100644 --- a/docs/docs/hookless-api/ExecutorchModule.md +++ b/docs/docs/hookless-api/ExecutorchModule.md @@ -1,6 +1,6 @@ --- title: ExecuTorchModule -sidebar_position: 5 +sidebar_position: 2 --- Hookless implementation of the [useExecutorchModule](../module-api/executorch-bindings.md) hook. diff --git a/docs/docs/hookless-api/LLMModule.md b/docs/docs/hookless-api/LLMModule.md index 005a2f7c..7656db14 100644 --- a/docs/docs/hookless-api/LLMModule.md +++ b/docs/docs/hookless-api/LLMModule.md @@ -1,6 +1,6 @@ --- title: LLMModule -sidebar_position: 4 +sidebar_position: 3 --- Hookless implementation of the [useLLM](../llms/running-llms.md) hook. diff --git a/docs/docs/hookless-api/ObjectDetectionModule.md b/docs/docs/hookless-api/ObjectDetectionModule.md index eaaf644d..2cc3504e 100644 --- a/docs/docs/hookless-api/ObjectDetectionModule.md +++ b/docs/docs/hookless-api/ObjectDetectionModule.md @@ -1,6 +1,6 @@ --- title: ObjectDetectionModule -sidebar_position: 2 +sidebar_position: 5 --- Hookless implementation of the [useObjectDetection](../computer-vision/useObjectDetection.mdx) hook. diff --git a/docs/docs/hookless-api/StyleTransferModule.md b/docs/docs/hookless-api/StyleTransferModule.md index d1d22023..f084d8ca 100644 --- a/docs/docs/hookless-api/StyleTransferModule.md +++ b/docs/docs/hookless-api/StyleTransferModule.md @@ -1,6 +1,6 @@ --- title: StyleTransferModule -sidebar_position: 3 +sidebar_position: 4 --- Hookless implementation of the [useStyleTransfer](../computer-vision/useStyleTransfer.mdx) hook. diff --git a/docs/docs/hookless-api/_category_.json b/docs/docs/hookless-api/_category_.json index 6c0a8908..e96f5186 100644 --- a/docs/docs/hookless-api/_category_.json +++ b/docs/docs/hookless-api/_category_.json @@ -1,6 +1,6 @@ { "label": "Hookless API", - "position": 5, + "position": 4, "link": { "type": "generated-index" } diff --git a/docs/docs/module-api/_category_.json b/docs/docs/module-api/_category_.json index 0a152175..b0400018 100644 --- a/docs/docs/module-api/_category_.json +++ b/docs/docs/module-api/_category_.json @@ -1,6 +1,6 @@ { "label": "Module API", - "position": 4, + "position": 5, "link": { "type": "generated-index" } diff --git a/docs/docs/utils/_category_.json b/docs/docs/utils/_category_.json new file mode 100644 index 00000000..4bbbc173 --- /dev/null +++ b/docs/docs/utils/_category_.json @@ -0,0 +1,7 @@ +{ + "label": "Utils", + "position": 6, + "link": { + "type": "generated-index" + } +} diff --git a/docs/docs/utils/list-downloaded-resources.md b/docs/docs/utils/list-downloaded-resources.md new file mode 100644 index 00000000..dd718255 --- /dev/null +++ b/docs/docs/utils/list-downloaded-resources.md @@ -0,0 +1,43 @@ +--- +title: List Downloaded Resources +sidebar_position: 1 +--- + +This module provides functions to retrieve a list of downloaded files stored in the application's document directory inside the `react-native-executorch/` directory. These utilities can help you manage your storage and clean up the downloaded files when they are no longer needed. + +## listDownloadedFiles + +Lists all the downloaded files used by React Native ExecuTorch. 
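+
+A typical use is reclaiming storage: feed the returned URIs into a file-deletion API. Below is a minimal cleanup sketch — it assumes an Expo environment where `expo-file-system` is available, so adjust the deletion call to whatever file-system library your app uses:
+
+```typescript
+import { listDownloadedFiles } from 'react-native-executorch';
+import * as FileSystem from 'expo-file-system';
+
+// Deletes every file downloaded by React Native ExecuTorch.
+// Models are simply re-downloaded the next time they are requested.
+async function clearDownloadedResources() {
+  const fileUris = await listDownloadedFiles();
+  for (const uri of fileUris) {
+    await FileSystem.deleteAsync(uri, { idempotent: true });
+  }
+}
+```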
+ +### Reference + +```typescript +import { listDownloadedFiles } from 'react-native-executorch'; + +const filesUris = await listDownloadedFiles(); +``` + +### Returns + +`Promise` - A promise, which resolves to an array of URIs for all the downloaded files. + +:::info +Since this function returns all the downloaded files, it also includes all the downloaded models. +If you want to list only the downloaded models, use the [listDownloadedModels](./list-downloaded-resources.md#listdownloadedmodels) function. +::: + +## listDownloadedModels + +Lists all the downloaded models used by React Native ExecuTorch. + +### Reference + +```typescript +import { listDownloadedModels } from 'react-native-executorch'; + +const modelsUris = await listDownloadedModels(); +``` + +### Returns + +`Promise` - A promise, which resolves to an array of URIs for all the downloaded models. From 642dfe51d8e87b517d59120176e1b8d68053a902 Mon Sep 17 00:00:00 2001 From: Jakub Mroz <115979017+jakmro@users.noreply.github.com> Date: Wed, 26 Feb 2025 09:57:25 +0100 Subject: [PATCH 6/8] docs: Add LLM messageHistory (#108) ## Description Add LLM messageHistory documentation ### Type of change - [ ] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) - [x] Documentation update (improves or adds clarity to existing documentation) ### Checklist - [x] I have performed a self-review of my code - [x] I have commented my code, particularly in hard-to-understand areas - [x] I have updated the documentation accordingly - [x] My changes generate no new warnings --- docs/docs/hookless-api/LLMModule.md | 22 ++++--- docs/docs/llms/{running-llms.md => useLLM.md} | 59 ++++++++++++++++++- 2 files changed, 71 insertions(+), 10 deletions(-) rename docs/docs/llms/{running-llms.md => useLLM.md} (86%) diff --git a/docs/docs/hookless-api/LLMModule.md b/docs/docs/hookless-api/LLMModule.md index 7656db14..d52e2e03 100644 --- a/docs/docs/hookless-api/LLMModule.md +++ b/docs/docs/hookless-api/LLMModule.md @@ -39,20 +39,25 @@ LLMModule.delete(); ### Methods -| Method | Type | Description | -| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------ | -| `load` | `(modelSource: ResourceSource, tokenizerSource: ResourceSource, systemPrompt?: string, contextWindowLength?: number): Promise` | Loads the model. Checkout the [loading the model](#loading-the-model) section for details. | -| `onDownloadProgress` | `(callback: (downloadProgress: number) => void): any` | Subscribe to the download progress event. | -| `generate` | `(input: string): Promise` | Method to start generating a response with the given input string. | -| `onToken` | (callback: (data: string | undefined) => void): any | Subscribe to the token generation event. | -| `interrupt` | `(): void` | Method to interrupt the current inference | -| `delete` | `(): void` | Method to delete the model from memory. 
|
+| Method               | Type                                                                                                                                                                  | Description                                                                                 |
+| -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------- |
+| `load`               | `(modelSource: ResourceSource, tokenizerSource: ResourceSource, systemPrompt?: string, messageHistory?: MessageType[], contextWindowLength?: number): Promise<void>` | Loads the model. Check out the [loading the model](#loading-the-model) section for details. |
+| `onDownloadProgress` | `(callback: (downloadProgress: number) => void): any`                                                                                                                   | Subscribe to the download progress event.                                                      |
+| `generate`           | `(input: string): Promise<void>`                                                                                                                                        | Method to start generating a response with the given input string.                            |
+| `onToken`            | `(callback: (data: string \| undefined) => void): any`                                                                                                                  | Subscribe to the token generation event.                                                       |
+| `interrupt`          | `(): void`                                                                                                                                                              | Method to interrupt the current inference.                                                     |
+| `delete`             | `(): void`                                                                                                                                                              | Method to delete the model from memory.                                                        |
 
 <details>
Type definitions ```typescript type ResourceSource = string | number; + +interface MessageType { + role: 'user' | 'assistant'; + content: string; +} ```
@@ -64,6 +69,7 @@ To load the model, use the `load` method. It accepts: - `modelSource` - A string that specifies the location of the model binary. For more information, take a look at [loading models](../fundamentals/loading-models.md) page. - `tokenizerSource` - URL to the binary file which contains the tokenizer - `systemPrompt` - Often used to tell the model what is its purpose, for example - "Be a helpful translator" +- `messageHistory` - An array of `MessageType` objects that represent the conversation history. This can be used to provide context to the model. - `contextWindowLength` - The number of messages from the current conversation that the model will use to generate a response. The higher the number, the more context the model will have. Keep in mind that using larger context windows will result in longer inference time and higher memory usage. This method returns a promise, which can resolve to an error or void. diff --git a/docs/docs/llms/running-llms.md b/docs/docs/llms/useLLM.md similarity index 86% rename from docs/docs/llms/running-llms.md rename to docs/docs/llms/useLLM.md index 36016f2c..cb32bff7 100644 --- a/docs/docs/llms/running-llms.md +++ b/docs/docs/llms/useLLM.md @@ -1,5 +1,5 @@ --- -title: Running LLMs +title: useLLM sidebar_position: 1 --- @@ -16,13 +16,62 @@ In order to load a model into the app, you need to run the following code: ```typescript import { useLLM, LLAMA3_2_1B } from 'react-native-executorch'; +const messageHistory = [ + { role: 'user', content: 'Hello' }, + { role: 'assistant', content: 'Hi, how can I help you?' }, +]; + const llama = useLLM({ modelSource: LLAMA3_2_1B, tokenizerSource: require('../assets/tokenizer.bin'), + systemPrompt: 'Be a helpful assistant', + messageHistory: messageHistory, contextWindowLength: 3, }); ``` +
+Type definitions + +```typescript +const useLLM: ({ + modelSource, + tokenizerSource, + systemPrompt, + messageHistory, + contextWindowLength, +}: { + modelSource: ResourceSource; + tokenizerSource: ResourceSource; + systemPrompt?: string; + messageHistory?: MessageType[]; + contextWindowLength?: number; +}) => Model; + +interface Model { + generate: (input: string) => Promise; + response: string; + downloadProgress: number; + error: string | null; + isModelGenerating: boolean; + isGenerating: boolean; + isModelReady: boolean; + isReady: boolean; + interrupt: () => void; +} + +type ResourceSource = string | number; + +interface MessageType { + role: 'user' | 'assistant'; + content: string; +} +``` + +
+ +
+ The code snippet above fetches the model from the specified URL, loads it into memory, and returns an object with various methods and properties for controlling the model. You can monitor the loading progress by checking the `llama.downloadProgress` and `llama.isReady` property, and if anything goes wrong, the `llama.error` property will contain the error message. :::danger @@ -39,9 +88,15 @@ Given computational constraints, our architecture is designed to support only on **`tokenizerSource`** - URL to the binary file which contains the tokenizer +**`systemPrompt`** - Often used to tell the model what is its purpose, for example - "Be a helpful translator" + +**`messageHistory`** - An array of `MessageType` objects that represent the conversation history. This can be used to provide context to the model. + **`contextWindowLength`** - The number of messages from the current conversation that the model will use to generate a response. The higher the number, the more context the model will have. Keep in mind that using larger context windows will result in longer inference time and higher memory usage. -**`systemPrompt`** - Often used to tell the model what is its purpose, for example - "Be a helpful translator" +:::note +Make sure that the reference to the `messageHistory` array is stable. Depending on your use case, you might use `useState` or `useRef` to store the message history. +::: ### Returns From 341d7a1eade40405bc10a7db842f8994e4d0d1a9 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Thu, 6 Mar 2025 09:15:16 +0100 Subject: [PATCH 7/8] docs: ocr (#105) ## Description ### Type of change - [ ] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) - [x] Documentation update (improves or adds clarity to existing documentation) ### Tested on - [ ] iOS - [ ] Android ### Testing instructions ### Screenshots ### Related issues ### Checklist - [x] I have performed a self-review of my code - [ ] I have commented my code, particularly in hard-to-understand areas - [x] I have updated the documentation accordingly - [ ] My changes generate no new warnings ### Additional notes --- docs/docs/benchmarks/inference-time.md | 22 ++ docs/docs/benchmarks/memory-usage.md | 13 ++ docs/docs/benchmarks/model-size.md | 18 ++ docs/docs/computer-vision/useOCR.md | 193 ++++++++++++++++ docs/docs/computer-vision/useVerticalOCR.md | 214 ++++++++++++++++++ .../docs/hookless-api/ClassificationModule.md | 2 +- docs/docs/hookless-api/OCRModule.md | 93 ++++++++ .../hookless-api/ObjectDetectionModule.md | 2 +- docs/docs/hookless-api/StyleTransferModule.md | 2 +- docs/docs/hookless-api/VerticalOCRModule.md | 107 +++++++++ docs/docs/module-api/executorch-bindings.md | 6 +- 11 files changed, 666 insertions(+), 6 deletions(-) create mode 100644 docs/docs/computer-vision/useOCR.md create mode 100644 docs/docs/computer-vision/useVerticalOCR.md create mode 100644 docs/docs/hookless-api/OCRModule.md create mode 100644 docs/docs/hookless-api/VerticalOCRModule.md diff --git a/docs/docs/benchmarks/inference-time.md b/docs/docs/benchmarks/inference-time.md index c1f91a3b..45c408a8 100644 --- a/docs/docs/benchmarks/inference-time.md +++ b/docs/docs/benchmarks/inference-time.md @@ -28,6 +28,28 @@ Times presented in the tables are measured as consecutive runs of the model. 
| STYLE_TRANSFER_UDNIE | 450 | 600 | 750 | 1650 | 1800 |
| STYLE_TRANSFER_RAIN_PRINCESS | 450 | 600 | 750 | 1650 | 1800 |

+## OCR
+
+| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] |
+| ----------- | ---------------------------- | -------------------------------- | -------------------------- | --------------------------------- | --------------------------------- |
+| CRAFT_800 | 2099 | 2227 | ❌ | 2245 | 7108 |
+| CRNN_EN_512 | 70 | 252 | ❌ | 54 | 151 |
+| CRNN_EN_256 | 39 | 123 | ❌ | 24 | 78 |
+| CRNN_EN_128 | 17 | 83 | ❌ | 14 | 39 |
+
+❌ - Insufficient RAM.
+
+## Vertical OCR
+
+| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] |
+| ----------- | ---------------------------- | -------------------------------- | -------------------------- | --------------------------------- | --------------------------------- |
+| CRAFT_1280 | 5457 | 5833 | ❌ | 6296 | 14053 |
+| CRAFT_320 | 1351 | 1460 | ❌ | 1485 | 3101 |
+| CRNN_EN_512 | 39 | 123 | ❌ | 24 | 78 |
+| CRNN_EN_64 | 10 | 33 | ❌ | 7 | 18 |
+
+❌ - Insufficient RAM.
+
## LLMs

| Model | iPhone 16 Pro (XNNPACK) [tokens/s] | iPhone 13 Pro (XNNPACK) [tokens/s] | iPhone SE 3 (XNNPACK) [tokens/s] | Samsung Galaxy S24 (XNNPACK) [tokens/s] | OnePlus 12 (XNNPACK) [tokens/s] |
diff --git a/docs/docs/benchmarks/memory-usage.md b/docs/docs/benchmarks/memory-usage.md
index 868a0884..2f535ad4 100644
--- a/docs/docs/benchmarks/memory-usage.md
+++ b/docs/docs/benchmarks/memory-usage.md
@@ -24,6 +24,19 @@ sidebar_position: 2
| STYLE_TRANSFER_UDNIE | 950 | 350 |
| STYLE_TRANSFER_RAIN_PRINCESS | 950 | 350 |
+## OCR
+
+| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
+| --------------------------------------------------- | ---------------------- | ------------------ |
+| CRAFT_800 + CRNN_EN_512 + CRNN_EN_256 + CRNN_EN_128 | 2100 | 1782 |
+
+## Vertical OCR
+
+| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
+| ------------------------------------ | ---------------------- | ------------------ |
+| CRAFT_1280 + CRAFT_320 + CRNN_EN_512 | 2770 | 3720 |
+| CRAFT_1280 + CRAFT_320 + CRNN_EN_64 | 1770 | 2740 |
+
## LLMs

| Model | Android (XNNPACK) [GB] | iOS (XNNPACK) [GB] |
diff --git a/docs/docs/benchmarks/model-size.md b/docs/docs/benchmarks/model-size.md
index a80f59d4..59f1d9bd 100644
--- a/docs/docs/benchmarks/model-size.md
+++ b/docs/docs/benchmarks/model-size.md
@@ -24,6 +24,24 @@ sidebar_position: 1
| STYLE_TRANSFER_UDNIE | 6.78 | 5.22 |
| STYLE_TRANSFER_RAIN_PRINCESS | 6.78 | 5.22 |
+## OCR
+
+| Model | XNNPACK [MB] |
+| ----------- | ------------ |
+| CRAFT_800 | 83.1 |
+| CRNN_EN_512 | 547 |
+| CRNN_EN_256 | 277 |
+| CRNN_EN_128 | 142 |
+
+## Vertical OCR
+
+| Model | XNNPACK [MB] |
+| ----------- | ------------ |
+| CRAFT_1280 | 83.1 |
+| CRAFT_320 | 83.1 |
+| CRNN_EN_512 | 277 |
+| CRNN_EN_64 | 74.3 |
+
## LLMs

| Model | XNNPACK [GB] |
diff --git a/docs/docs/computer-vision/useOCR.md b/docs/docs/computer-vision/useOCR.md
new file mode 100644
index 00000000..e2431f49
--- /dev/null
+++ b/docs/docs/computer-vision/useOCR.md
@@ -0,0 +1,193 @@
+---
+title: useOCR
+sidebar_position: 4
+---
+
+Optical character recognition (OCR) is a computer vision technique that detects and recognizes text within an image.
It's commonly used to convert different types of documents, such as scanned paper documents, PDF files, or images captured by a digital camera, into editable and searchable data. + +:::caution +It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/software-mansion). You can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/765305abc289083787eb9613b899d6fcc0e24126/src/constants/modelUrls.ts#L51) shipped with our library. +::: + +## Reference + +```jsx +import { + useOCR, + CRAFT_800, + RECOGNIZER_EN_CRNN_512, + RECOGNIZER_EN_CRNN_256, + RECOGNIZER_EN_CRNN_128 +} from 'react-native-executorch'; + +function App() { + const model = useOCR({ + detectorSource: CRAFT_800, + recognizerSources: { + recognizerLarge: RECOGNIZER_EN_CRNN_512, + recognizerMedium: RECOGNIZER_EN_CRNN_256, + recognizerSmall: RECOGNIZER_EN_CRNN_128 + }, + language: "en", + }); + + ... + for (const ocrDetection of await model.forward("https://url-to-image.jpg")) { + console.log("Bounding box: ", ocrDetection.bbox); + console.log("Bounding label: ", ocrDetection.text); + console.log("Bounding score: ", ocrDetection.score); + } + ... +} +``` + +
+Type definitions + +```typescript +interface RecognizerSources { + recognizerLarge: string | number; + recognizerMedium: string | number; + recognizerSmall: string | number; +} + +type OCRLanguage = 'en'; + +interface Point { + x: number; + y: number; +} + +interface OCRDetection { + bbox: Point[]; + text: string; + score: number; +} +``` + +
+
+### Arguments
+
+**`detectorSource`** - A string that specifies the location of the detector binary. For more information, take a look at [loading models](../fundamentals/loading-models.md) section.
+
+**`recognizerSources`** - An object that specifies the locations of the recognizers binary files. Each recognizer is composed of three models tailored to process images of varying widths.
+
+- `recognizerLarge` - A string that specifies the location of the recognizer binary file which accepts input images with a width of 512 pixels.
+- `recognizerMedium` - A string that specifies the location of the recognizer binary file which accepts input images with a width of 256 pixels.
+- `recognizerSmall` - A string that specifies the location of the recognizer binary file which accepts input images with a width of 128 pixels.
+
+For more information, take a look at [loading models](../fundamentals/loading-models.md) section.
+
+**`language`** - A parameter that specifies the language of the text to be recognized by the OCR.
+
+### Returns
+
+The hook returns an object with the following properties:
+
+| Field | Type | Description |
+| ------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------- |
+| `forward` | `(input: string) => Promise<OCRDetection[]>` | A function that accepts an image (url, b64) and returns an array of `OCRDetection` objects. |
+| `error` | string | null | Contains the error message if the model loading failed. |
+| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. |
+| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. |
+| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. |
+
+## Running the model
+
+To run the model, you can use the `forward` method. It accepts one argument, which is the image. The image can be a remote URL, a local file URI, or a base64-encoded image. The function returns an array of `OCRDetection` objects. Each object contains coordinates of the bounding box, the text recognized within the box, and the confidence score. For more information, please refer to the reference or type definitions.
+
+## Detection object
+
+The detection object is specified as follows:
+
+```typescript
+interface Point {
+  x: number;
+  y: number;
+}
+
+interface OCRDetection {
+  bbox: Point[];
+  text: string;
+  score: number;
+}
+```
+
+The `bbox` property contains information about the bounding box of detected text regions. It is represented as four points, which are the corners of the detected bounding box.
+The `text` property contains the text recognized within the detected text region. The `score` represents the confidence score of the recognized text.
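+
+Since `bbox` comes back as four corner points rather than a ready-made rectangle, converting it is a common first step before drawing overlays. Below is a minimal sketch of such a conversion; it assumes the points may arrive in any order and that the `OCRDetection` type is available for import (if it is not exported by the library, you can copy the interface from the type definitions above).
+
+```typescript
+import type { OCRDetection } from 'react-native-executorch';
+
+// Convert the four corner points of a detection into an axis-aligned
+// rectangle, e.g. for positioning an overlay on top of the image.
+function toRect(detection: OCRDetection) {
+  const xs = detection.bbox.map((point) => point.x);
+  const ys = detection.bbox.map((point) => point.y);
+  const left = Math.min(...xs);
+  const top = Math.min(...ys);
+  return { left, top, width: Math.max(...xs) - left, height: Math.max(...ys) - top };
+}
+```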
+ +## Example + +```tsx +import { + useOCR, + CRAFT_800, + RECOGNIZER_EN_CRNN_512, + RECOGNIZER_EN_CRNN_256, + RECOGNIZER_EN_CRNN_128, +} from 'react-native-executorch'; + +function App() { + const model = useOCR({ + detectorSource: CRAFT_800, + recognizerSources: { + recognizerLarge: RECOGNIZER_EN_CRNN_512, + recognizerMedium: RECOGNIZER_EN_CRNN_256, + recognizerSmall: RECOGNIZER_EN_CRNN_128, + }, + language: 'en', + }); + + const runModel = async () => { + const ocrDetections = await model.forward('https://url-to-image.jpg'); + + for (const ocrDetection of ocrDetections) { + console.log('Bounding box: ', ocrDetection.bbox); + console.log('Bounding text: ', ocrDetection.text); + console.log('Bounding score: ', ocrDetection.score); + } + }; +} +``` + +## Supported models + +| Model | Type | +| ------------------------------------------------------ | ---------- | +| [CRAFT_800](https://github.com/clovaai/CRAFT-pytorch) | Detector | +| [CRNN_EN_512](https://www.jaided.ai/easyocr/modelhub/) | Recognizer | +| [CRNN_EN_256](https://www.jaided.ai/easyocr/modelhub/) | Recognizer | +| [CRNN_EN_128](https://www.jaided.ai/easyocr/modelhub/) | Recognizer | + +## Benchmarks + +### Model size + +| Model | XNNPACK [MB] | +| ----------- | ------------ | +| CRAFT_800 | 83.1 | +| CRNN_EN_512 | 547 | +| CRNN_EN_256 | 277 | +| CRNN_EN_128 | 142 | + +### Memory usage + +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| --------------------------------------------------- | ---------------------- | ------------------ | +| CRAFT_800 + CRNN_EN_512 + CRNN_EN_256 + CRNN_EN_128 | 2100 | 1782 | + +### Inference time + +:::warning warning +Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. +::: + +| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | +| ----------- | ---------------------------- | -------------------------------- | -------------------------- | --------------------------------- | --------------------------------- | +| CRAFT_800 | 2099 | 2227 | ❌ | 2245 | 7108 | +| CRNN_EN_512 | 70 | 252 | ❌ | 54 | 151 | +| CRNN_EN_256 | 39 | 123 | ❌ | 24 | 78 | +| CRNN_EN_128 | 17 | 83 | ❌ | 14 | 39 | + +❌ - Insufficient RAM. diff --git a/docs/docs/computer-vision/useVerticalOCR.md b/docs/docs/computer-vision/useVerticalOCR.md new file mode 100644 index 00000000..8fb82d50 --- /dev/null +++ b/docs/docs/computer-vision/useVerticalOCR.md @@ -0,0 +1,214 @@ +--- +title: useVerticalOCR +sidebar_position: 5 +--- + +:::danger Experimental +The `useVerticalOCR` hook is currently in an experimental phase. We appreciate feedback from users as we continue to refine and enhance its functionality. +::: + +Optical Character Recognition (OCR) is a computer vision technique used to detect and recognize text within images. It is commonly utilized to convert a variety of documents, such as scanned paper documents, PDF files, or images captured by a digital camera, into editable and searchable data. Traditionally, OCR technology has been optimized for recognizing horizontal text, and integrating support for vertical text recognition often requires significant additional effort from developers. To simplify this, we introduce `useVerticalOCR`, a tool designed to abstract the complexities of vertical text OCR, enabling seamless integration into your applications. 
+ +:::caution +It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/software-mansion). You can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/765305abc289083787eb9613b899d6fcc0e24126/src/constants/modelUrls.ts#L51) shipped with our library. +::: + +## Reference + +```jsx +import { + DETECTOR_CRAFT_1280, + DETECTOR_CRAFT_320, + RECOGNIZER_EN_CRNN_512, + RECOGNIZER_EN_CRNN_64, + useVerticalOCR, +} from 'react-native-executorch'; + +function App() { + const model = useVerticalOCR({ + detectorSources: { + detectorLarge: DETECTOR_CRAFT_1280, + detectorNarrow: DETECTOR_CRAFT_320, + }, + recognizerSources: { + recognizerLarge: RECOGNIZER_EN_CRNN_512, + recognizerSmall: RECOGNIZER_EN_CRNN_64, + }, + language: 'en', + independentCharacters: true, + }); + + ... + for (const ocrDetection of await model.forward("https://url-to-image.jpg")) { + console.log("Bounding box: ", ocrDetection.bbox); + console.log("Bounding label: ", ocrDetection.text); + console.log("Bounding score: ", ocrDetection.score); + } + ... +} +``` + +
+Type definitions + +```typescript +interface DetectorSources { + detectorLarge: string | number; + detectorNarrow: string | number; +} + +interface RecognizerSources { + recognizerLarge: string | number; + recognizerSmall: string | number; +} + +type OCRLanguage = 'en'; + +interface Point { + x: number; + y: number; +} + +interface OCRDetection { + bbox: Point[]; + text: string; + score: number; +} +``` + +
+
+### Arguments
+
+**`detectorSources`** - An object that specifies the location of the detectors binary files. Each detector is composed of two models tailored to process images of varying widths.
+
+- `detectorLarge` - A string that specifies the location of the detector binary file which accepts input images with a width of 1280 pixels.
+- `detectorNarrow` - A string that specifies the location of the detector binary file which accepts input images with a width of 320 pixels.
+
+For more information, take a look at [loading models](../fundamentals/loading-models.md) section.
+
+**`recognizerSources`** - An object that specifies the locations of the recognizers binary files. Each recognizer is composed of two models tailored to process images of varying widths.
+
+- `recognizerLarge` - A string that specifies the location of the recognizer binary file which accepts input images with a width of 512 pixels.
+- `recognizerSmall` - A string that specifies the location of the recognizer binary file which accepts input images with a width of 64 pixels.
+
+For more information, take a look at [loading models](../fundamentals/loading-models.md) section.
+
+**`language`** - A parameter that specifies the language of the text to be recognized by the OCR.
+
+**`independentCharacters`** - A boolean parameter that indicates whether the text in the image consists of a random sequence of characters. If set to true, the algorithm will scan each character individually instead of reading them as continuous text.
+
+### Returns
+
+The hook returns an object with the following properties:
+
+| Field | Type | Description |
+| ------------------ | -------------------------------------------- | ------------------------------------------------------------------------------------------- |
+| `forward` | `(input: string) => Promise<OCRDetection[]>` | A function that accepts an image (url, b64) and returns an array of `OCRDetection` objects. |
+| `error` | string | null | Contains the error message if the model loading failed. |
+| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. |
+| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. |
+| `downloadProgress` | `number` | Represents the download progress as a value between 0 and 1. |
+
+## Running the model
+
+To run the model, you can use the `forward` method. It accepts one argument, which is the image. The image can be a remote URL, a local file URI, or a base64-encoded image. The function returns an array of `OCRDetection` objects. Each object contains coordinates of the bounding box, the text recognized within the box, and the confidence score. For more information, please refer to the reference or type definitions.
+
+## Detection object
+
+The detection object is specified as follows:
+
+```typescript
+interface Point {
+  x: number;
+  y: number;
+}
+
+interface OCRDetection {
+  bbox: Point[];
+  text: string;
+  score: number;
+}
+```
+
+The `bbox` property contains information about the bounding box of detected text regions. It is represented as four points, which are the corners of the detected bounding box.
+The `text` property contains the text recognized within the detected text region. The `score` represents the confidence score of the recognized text.
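+
+Because detections are returned per text region, reconstructing the reading order is left to the caller. For a single vertical column, a plausible default is to sort regions top-to-bottom before joining them. The sketch below assumes exactly that layout (and that `OCRDetection` is importable); multi-column pages would need extra grouping logic.
+
+```typescript
+import type { OCRDetection } from 'react-native-executorch';
+
+// Sort detections by the top edge of their bounding boxes (top-to-bottom)
+// and join the recognized fragments into a single string.
+function joinVerticalColumn(detections: OCRDetection[]): string {
+  const topEdge = (d: OCRDetection) => Math.min(...d.bbox.map((p) => p.y));
+  return [...detections]
+    .sort((a, b) => topEdge(a) - topEdge(b))
+    .map((d) => d.text)
+    .join('');
+}
+```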
+ +## Example + +```tsx +import { + DETECTOR_CRAFT_1280, + DETECTOR_CRAFT_320, + RECOGNIZER_EN_CRNN_512, + RECOGNIZER_EN_CRNN_64, + useVerticalOCR, +} from 'react-native-executorch'; + +function App() { + const model = useVerticalOCR({ + detectorSources: { + detectorLarge: DETECTOR_CRAFT_1280, + detectorNarrow: DETECTOR_CRAFT_320, + }, + recognizerSources: { + recognizerLarge: RECOGNIZER_EN_CRNN_512, + recognizerSmall: RECOGNIZER_EN_CRNN_64, + }, + language: 'en', + independentCharacters: true, + }); + + const runModel = async () => { + const ocrDetections = await model.forward('https://url-to-image.jpg'); + + for (const ocrDetection of ocrDetections) { + console.log('Bounding box: ', ocrDetection.bbox); + console.log('Bounding text: ', ocrDetection.text); + console.log('Bounding score: ', ocrDetection.score); + } + }; +} +``` + +## Supported models + +| Model | Type | +| -------------------------------------------------------- | ---------- | +| [CRAFT_1280](https://github.com/clovaai/CRAFT-pytorch) | Detector | +| [CRAFT_NARROW](https://github.com/clovaai/CRAFT-pytorch) | Detector | +| [CRNN_EN_512](https://www.jaided.ai/easyocr/modelhub/) | Recognizer | +| [CRNN_EN_64](https://www.jaided.ai/easyocr/modelhub/) | Recognizer | + +## Benchmarks + +### Model size + +| Model | XNNPACK [MB] | +| ----------- | ------------ | +| CRAFT_1280 | 83.1 | +| CRAFT_320 | 83.1 | +| CRNN_EN_512 | 277 | +| CRNN_EN_64 | 74.3 | + +### Memory usage + +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| ------------------------------------ | ---------------------- | ------------------ | +| CRAFT_1280 + CRAFT_320 + CRNN_EN_512 | 2770 | 3720 | +| CRAFT_1280 + CRAFT_320 + CRNN_EN_64 | 1770 | 2740 | + +### Inference time + +:::warning warning +Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization. +::: + +| Model | iPhone 16 Pro (XNNPACK) [ms] | iPhone 14 Pro Max (XNNPACK) [ms] | iPhone SE 3 (XNNPACK) [ms] | Samsung Galaxy S24 (XNNPACK) [ms] | Samsung Galaxy S21 (XNNPACK) [ms] | +| ----------- | ---------------------------- | -------------------------------- | -------------------------- | --------------------------------- | --------------------------------- | +| CRAFT_1280 | 5457 | 5833 | ❌ | 6296 | 14053 | +| CRAFT_320 | 1351 | 1460 | ❌ | 1485 | 3101 | +| CRNN_EN_512 | 39 | 123 | ❌ | 24 | 78 | +| CRNN_EN_64 | 10 | 33 | ❌ | 7 | 18 | + +❌ - Insufficient RAM. diff --git a/docs/docs/hookless-api/ClassificationModule.md b/docs/docs/hookless-api/ClassificationModule.md index 732971db..2e62cbd4 100644 --- a/docs/docs/hookless-api/ClassificationModule.md +++ b/docs/docs/hookless-api/ClassificationModule.md @@ -3,7 +3,7 @@ title: ClassificationModule sidebar_position: 1 --- -Hookless implementation of the [useClassification](../computer-vision/useClassification.mdx) hook. +Hookless implementation of the [useClassification](../computer-vision/useClassification.md) hook. ## Reference diff --git a/docs/docs/hookless-api/OCRModule.md b/docs/docs/hookless-api/OCRModule.md new file mode 100644 index 00000000..49337119 --- /dev/null +++ b/docs/docs/hookless-api/OCRModule.md @@ -0,0 +1,93 @@ +--- +title: OCRModule +sidebar_position: 6 +--- + +Hookless implementation of the [useOCR](../computer-vision/useOCR.md) hook. 
+ +## Reference + +```typescript +import { + OCRModule, + CRAFT_800, + RECOGNIZER_EN_CRNN_512, + RECOGNIZER_EN_CRNN_256, + RECOGNIZER_EN_CRNN_128, +} from 'react-native-executorch'; +const imageUri = 'path/to/image.png'; + +// Loading the model +await OCRModule.load({ + detectorSource: CRAFT_800, + recognizerSources: { + recognizerLarge: RECOGNIZER_EN_CRNN_512, + recognizerMedium: RECOGNIZER_EN_CRNN_256, + recognizerSmall: RECOGNIZER_EN_CRNN_128, + }, + language: 'en', +}); + +// Running the model +const ocrDetections = await OCRModule.forward(imageUri); +``` + +### Methods + +| Method | Type | Description | +| -------------------- | ------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------- | +| `load` | `(detectorSource: string, recognizerSources: RecognizerSources, language: OCRLanguage): Promise` | Loads the detector and recognizers, which sources are represented by `RecognizerSources`. | +| `forward` | `(input: string): Promise` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. | +| `onDownloadProgress` | `(callback: (downloadProgress: number) => void): any` | Subscribe to the download progress event. | + +
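+
+As a rough sketch, subscribing before calling `load` ensures no progress events are missed. Note that the table above types the return value of `onDownloadProgress` as `any`, so whether it can later be used to unsubscribe is an assumption rather than a documented guarantee:
+
+```typescript
+import { OCRModule } from 'react-native-executorch';
+
+// Log download progress as a percentage while the model files are fetched.
+OCRModule.onDownloadProgress((downloadProgress: number) => {
+  console.log(`Model download: ${Math.round(downloadProgress * 100)}%`);
+});
+```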
+Type definitions + +```typescript +interface RecognizerSources { + recognizerLarge: string | number; + recognizerMedium: string | number; + recognizerSmall: string | number; +} + +type OCRLanguage = 'en'; + +interface Point { + x: number; + y: number; +} + +interface OCRDetection { + bbox: Point[]; + text: string; + score: number; +} +``` + +
+
+## Loading the model
+
+To load the model, use the `load` method. It accepts:
+
+**`detectorSource`** - A string that specifies the location of the detector binary. For more information, take a look at [loading models](../fundamentals/loading-models.md) section.
+
+**`recognizerSources`** - An object that specifies the locations of the recognizers binary files. Each recognizer is composed of three models tailored to process images of varying widths.
+
+- `recognizerLarge` - A string that specifies the location of the recognizer binary file which accepts input images with a width of 512 pixels.
+- `recognizerMedium` - A string that specifies the location of the recognizer binary file which accepts input images with a width of 256 pixels.
+- `recognizerSmall` - A string that specifies the location of the recognizer binary file which accepts input images with a width of 128 pixels.
+
+For more information, take a look at [loading models](../fundamentals/loading-models.md) section.
+
+**`language`** - A parameter that specifies the language of the text to be recognized by the OCR.
+
+This method returns a promise, which can resolve to an error or void.
+
+## Listening for download progress
+
+To subscribe to the download progress event, you can use the `onDownloadProgress` method. It accepts a callback function that will be called whenever the download progress changes.
+
+## Running the model
+
+To run the model, you can use the `forward` method. It accepts one argument, which is the image. The image can be a remote URL, a local file URI, or a base64-encoded image. The method returns a promise, which can resolve either to an error or an array of `OCRDetection` objects. Each object contains coordinates of the bounding box, the text recognized within the box, and the confidence score.
diff --git a/docs/docs/hookless-api/ObjectDetectionModule.md b/docs/docs/hookless-api/ObjectDetectionModule.md
index 2cc3504e..6c730b7f 100644
--- a/docs/docs/hookless-api/ObjectDetectionModule.md
+++ b/docs/docs/hookless-api/ObjectDetectionModule.md
@@ -3,7 +3,7 @@ title: ObjectDetectionModule
 sidebar_position: 5
 ---
 
-Hookless implementation of the [useObjectDetection](../computer-vision/useObjectDetection.mdx) hook.
+Hookless implementation of the [useObjectDetection](../computer-vision/useObjectDetection.md) hook.
 
 ## Reference
 
diff --git a/docs/docs/hookless-api/StyleTransferModule.md b/docs/docs/hookless-api/StyleTransferModule.md
index f084d8ca..29c750be 100644
--- a/docs/docs/hookless-api/StyleTransferModule.md
+++ b/docs/docs/hookless-api/StyleTransferModule.md
@@ -3,7 +3,7 @@ title: StyleTransferModule
 sidebar_position: 4
 ---
 
-Hookless implementation of the [useStyleTransfer](../computer-vision/useStyleTransfer.mdx) hook.
+Hookless implementation of the [useStyleTransfer](../computer-vision/useStyleTransfer.md) hook.
 
 ## Reference
 
diff --git a/docs/docs/hookless-api/VerticalOCRModule.md b/docs/docs/hookless-api/VerticalOCRModule.md
new file mode 100644
index 00000000..d876b827
--- /dev/null
+++ b/docs/docs/hookless-api/VerticalOCRModule.md
@@ -0,0 +1,107 @@
+---
+title: VerticalOCRModule
+sidebar_position: 7
+---
+
+Hookless implementation of the [useVerticalOCR](../computer-vision/useVerticalOCR.md) hook.
+
+## Reference
+
+```typescript
+import {
+  DETECTOR_CRAFT_1280,
+  DETECTOR_CRAFT_320,
+  RECOGNIZER_EN_CRNN_512,
+  RECOGNIZER_EN_CRNN_64,
+  VerticalOCRModule,
+} from 'react-native-executorch';
+
+const imageUri = 'path/to/image.png';
+
+// Loading the model
+await VerticalOCRModule.load({
+  detectorSources: {
+    detectorLarge: DETECTOR_CRAFT_1280,
+    detectorNarrow: DETECTOR_CRAFT_320,
+  },
+  recognizerSources: {
+    recognizerLarge: RECOGNIZER_EN_CRNN_512,
+    recognizerSmall: RECOGNIZER_EN_CRNN_64,
+  },
+  language: 'en',
+  independentCharacters: true,
+});
+
+// Running the model
+const ocrDetections = await VerticalOCRModule.forward(imageUri);
+```
+
+### Methods
+
+| Method | Type | Description |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------ |
+| `load` | `(detectorSources: DetectorSources, recognizerSources: RecognizerSources, language: OCRLanguage, independentCharacters: boolean): Promise<void>` | Loads detectors and recognizers, whose sources are represented by `DetectorSources` and `RecognizerSources`. |
+| `forward` | `(input: string): Promise<OCRDetection[]>` | Executes the model's forward pass, where `input` can be a fetchable resource or a Base64-encoded string. |
+| `onDownloadProgress` | `(callback: (downloadProgress: number) => void): any` | Subscribe to the download progress event. |
+Type definitions + +```typescript +interface DetectorSources { + detectorLarge: string | number; + detectorNarrow: string | number; +} + +interface RecognizerSources { + recognizerLarge: string | number; + recognizerSmall: string | number; +} + +type OCRLanguage = 'en'; + +interface Point { + x: number; + y: number; +} + +interface OCRDetection { + bbox: Point[]; + text: string; + score: number; +} +``` + +
+
+## Loading the model
+
+To load the model, use the `load` method. It accepts:
+
+**`detectorSources`** - An object that specifies the location of the detectors binary files. Each detector is composed of two models tailored to process images of varying widths.
+
+- `detectorLarge` - A string that specifies the location of the detector binary file which accepts input images with a width of 1280 pixels.
+- `detectorNarrow` - A string that specifies the location of the detector binary file which accepts input images with a width of 320 pixels.
+
+For more information, take a look at [loading models](../fundamentals/loading-models.md) section.
+
+**`recognizerSources`** - An object that specifies the locations of the recognizers binary files. Each recognizer is composed of two models tailored to process images of varying widths.
+
+- `recognizerLarge` - A string that specifies the location of the recognizer binary file which accepts input images with a width of 512 pixels.
+- `recognizerSmall` - A string that specifies the location of the recognizer binary file which accepts input images with a width of 64 pixels.
+
+For more information, take a look at [loading models](../fundamentals/loading-models.md) section.
+
+**`language`** - A parameter that specifies the language of the text to be recognized by the OCR.
+
+**`independentCharacters`** - A boolean parameter that indicates whether the text in the image consists of a random sequence of characters. If set to true, the algorithm will scan each character individually instead of reading them as continuous text.
+
+This method returns a promise, which can resolve to an error or void.
+
+## Listening for download progress
+
+To subscribe to the download progress event, you can use the `onDownloadProgress` method. It accepts a callback function that will be called whenever the download progress changes.
+
+## Running the model
+
+To run the model, you can use the `forward` method. It accepts one argument, which is the image. The image can be a remote URL, a local file URI, or a base64-encoded image. The method returns a promise, which can resolve either to an error or an array of `OCRDetection` objects. Each object contains coordinates of the bounding box, the text recognized within the box, and the confidence score.
diff --git a/docs/docs/module-api/executorch-bindings.md b/docs/docs/module-api/executorch-bindings.md
index 282beaf5..e2e48ab6 100644
--- a/docs/docs/module-api/executorch-bindings.md
+++ b/docs/docs/module-api/executorch-bindings.md
@@ -61,7 +61,7 @@ To run model with ExecuTorch Bindings it's essential to specify the shape of the
 
 This example demonstrates the integration and usage of the ExecuTorch bindings with a [style transfer model](../computer-vision/useStyleTransfer.md). Specifically, we'll be using the `STYLE_TRANSFER_CANDY` model, which applies artistic style transfer to an input image.
 
-## Importing the Module and loading the model
+### Importing the Module and loading the model
 
 First, import the necessary functions from the `react-native-executorch` package and initialize the ExecuTorch module with the specified style transfer model.
 
@@ -77,7 +77,7 @@ const executorchModule = useExecutorchModule({
 });
 ```
 
-## Setting up input parameters
+### Setting up input parameters
 
 To prepare the input for the model, define the shape of the input tensor. This shape depends on the model's requirements.
For the `STYLE_TRANSFER_CANDY` model, we need a tensor of shape `[1, 3, 640, 640]`, corresponding to a batch size of 1, 3 color channels (RGB), and dimensions of 640x640 pixels. @@ -88,7 +88,7 @@ const shape = [1, 3, 640, 640]; const input = new Float32Array(1 * 3 * 640 * 640); // fill this array with your image data ``` -## Performing inference +### Performing inference ```typescript try { From 83fd31f53f972f67febfd42ff453b50a34a60e27 Mon Sep 17 00:00:00 2001 From: Jakub Chmura <92989966+chmjkb@users.noreply.github.com> Date: Thu, 6 Mar 2025 16:05:56 +0100 Subject: [PATCH 8/8] docs: Speech to Text (#111) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description ### Type of change - [ ] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) - [x] Documentation update (improves or adds clarity to existing documentation) ### Tested on - [ ] iOS - [ ] Android ### Testing instructions ### Screenshots ### Related issues ### Checklist - [ ] I have performed a self-review of my code - [ ] I have commented my code, particularly in hard-to-understand areas - [ ] I have updated the documentation accordingly - [ ] My changes generate no new warnings ### Additional notes --------- Co-authored-by: jakmro Co-authored-by: Jakub Mroz <115979017+jakmro@users.noreply.github.com> Co-authored-by: Mateusz Kopciński --- docs/docs/benchmarks/_category_.json | 2 +- docs/docs/benchmarks/memory-usage.md | 7 ++ docs/docs/benchmarks/model-size.md | 7 ++ docs/docs/computer-vision/_category_.json | 2 +- docs/docs/hookless-api/LLMModule.md | 2 +- docs/docs/hookless-api/SpeechToTextModule.md | 55 ++++++++ docs/docs/hookless-api/_category_.json | 2 +- docs/docs/module-api/_category_.json | 2 +- docs/docs/speech-to-text/_category_.json | 7 ++ docs/docs/speech-to-text/useSpeechToText.md | 125 +++++++++++++++++++ docs/docs/utils/_category_.json | 2 +- 11 files changed, 207 insertions(+), 6 deletions(-) create mode 100644 docs/docs/hookless-api/SpeechToTextModule.md create mode 100644 docs/docs/speech-to-text/_category_.json create mode 100644 docs/docs/speech-to-text/useSpeechToText.md diff --git a/docs/docs/benchmarks/_category_.json b/docs/docs/benchmarks/_category_.json index 001b3495..e7903346 100644 --- a/docs/docs/benchmarks/_category_.json +++ b/docs/docs/benchmarks/_category_.json @@ -1,6 +1,6 @@ { "label": "Benchmarks", - "position": 7, + "position": 8, "link": { "type": "generated-index" } diff --git a/docs/docs/benchmarks/memory-usage.md b/docs/docs/benchmarks/memory-usage.md index 2f535ad4..958935ea 100644 --- a/docs/docs/benchmarks/memory-usage.md +++ b/docs/docs/benchmarks/memory-usage.md @@ -47,3 +47,10 @@ sidebar_position: 2 | LLAMA3_2_3B | 7.1 | 7.3 | | LLAMA3_2_3B_SPINQUANT | 3.7 | 3.8 | | LLAMA3_2_3B_QLORA | 4 | 4.1 | + +## Speech to text + +| Model | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] | +| -------------- | ---------------------- | ------------------ | +| WHISPER_TINY | 900 | 600 | +| MOONSHINE_TINY | 650 | 560 | diff --git a/docs/docs/benchmarks/model-size.md b/docs/docs/benchmarks/model-size.md index 59f1d9bd..672a607e 100644 --- a/docs/docs/benchmarks/model-size.md +++ b/docs/docs/benchmarks/model-size.md @@ -52,3 +52,10 @@ sidebar_position: 1 | LLAMA3_2_3B | 6.43 | | LLAMA3_2_3B_SPINQUANT | 2.55 | | LLAMA3_2_3B_QLORA | 2.65 | + +## Speech to text + +| Model | XNNPACK [MB] | +| 
-------------- | ------------ |
+| WHISPER_TINY | 231.0 |
+| MOONSHINE_TINY | 148.9 |
diff --git a/docs/docs/computer-vision/_category_.json b/docs/docs/computer-vision/_category_.json
index 5aa6c026..1a78d5e7 100644
--- a/docs/docs/computer-vision/_category_.json
+++ b/docs/docs/computer-vision/_category_.json
@@ -1,6 +1,6 @@
 {
   "label": "Computer Vision",
-  "position": 3,
+  "position": 4,
   "link": {
     "type": "generated-index"
   }
diff --git a/docs/docs/hookless-api/LLMModule.md b/docs/docs/hookless-api/LLMModule.md
index d52e2e03..037b151b 100644
--- a/docs/docs/hookless-api/LLMModule.md
+++ b/docs/docs/hookless-api/LLMModule.md
@@ -3,7 +3,7 @@ title: LLMModule
 sidebar_position: 3
 ---
 
-Hookless implementation of the [useLLM](../llms/running-llms.md) hook.
+Hookless implementation of the [useLLM](../llms/useLLM.md) hook.
 
 ## Reference
 
diff --git a/docs/docs/hookless-api/SpeechToTextModule.md b/docs/docs/hookless-api/SpeechToTextModule.md
new file mode 100644
index 00000000..2438c843
--- /dev/null
+++ b/docs/docs/hookless-api/SpeechToTextModule.md
@@ -0,0 +1,55 @@
+---
+title: SpeechToTextModule
+sidebar_position: 6
+---
+
+Hookless implementation of the [useSpeechToText](../speech-to-text/) hook.
+
+## Reference
+
+```typescript
+import { SpeechToTextModule } from 'react-native-executorch';
+
+const audioUrl = 'https://www.your-url.com/cool-audio.mp3';
+
+// Loading the model
+const onSequenceUpdate = (sequence) => {
+  console.log(sequence);
+};
+await SpeechToTextModule.load('moonshine', onSequenceUpdate);
+
+// Loading the audio and running the model
+await SpeechToTextModule.loadAudio(audioUrl);
+const transcribedText = await SpeechToTextModule.transcribe();
+```
+
+### Methods
+
+| Method | Type | Description |
+| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `load` | (modelName: 'whisper' | 'moonshine', transcribeCallback?: (sequence: string) => void, modelDownloadProgressCallback?: (downloadProgress: number) => void, encoderSource?: ResourceSource, decoderSource?: ResourceSource, tokenizerSource?: ResourceSource) | Loads the model specified with `modelName`, where `encoderSource`, `decoderSource` and `tokenizerSource` are strings specifying the location of the binaries for the models. `modelDownloadProgressCallback` allows you to monitor the current progress of the model download, while `transcribeCallback` is invoked with each generated token. |
+| `transcribe` | `(waveform: number[]): Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. |
+| `loadAudio` | `(url: string) => void` | Loads an audio file from the given URL. It sets an internal state which serves as an input to `transcribe()`. |
+| `encode` | `(waveform: number[]) => Promise<number[]>` | Runs the encoding part of the model. Returns a float array representing the output of the encoder. |
+| `decode` | `(tokens: number[], encodings: number[]) => Promise<number>` | Runs the decoder of the model. Returns a single token representing the next token in the output sequence. |
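+
+To make the relationship between `encode` and `decode` concrete, here is a rough sketch of a greedy decoding loop. The token ids (`START_TOKEN`, `EOS_TOKEN`) are placeholders - the real values depend on the model's tokenizer - so treat this as pseudo-usage rather than a drop-in implementation:
+
+```typescript
+import { SpeechToTextModule } from 'react-native-executorch';
+
+// Hypothetical special token ids - substitute the ones used by your tokenizer.
+const START_TOKEN = 1;
+const EOS_TOKEN = 2;
+
+async function greedyDecode(waveform: number[]): Promise<number[]> {
+  const encodings = await SpeechToTextModule.encode(waveform);
+  const tokens: number[] = [START_TOKEN];
+  // In practice you would also cap the number of iterations.
+  while (tokens[tokens.length - 1] !== EOS_TOKEN) {
+    tokens.push(await SpeechToTextModule.decode(tokens, encodings));
+  }
+  return tokens;
+}
+```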
+Type definitions + +```typescript +type ResourceSource = string | number; +``` + +
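+Note that `ResourceSource` also covers assets bundled with the app: a `require(...)` call resolves to a number, which the loader accepts just like a URL string. A hedged sketch with placeholder paths (the file names below are assumptions, not shipped assets):
+
+```typescript
+import { SpeechToTextModule } from 'react-native-executorch';
+
+// Placeholder asset paths - substitute your own bundled model files.
+await SpeechToTextModule.load(
+  'whisper',
+  undefined, // transcribeCallback (not needed here)
+  (progress) => console.log(`Download: ${progress}`),
+  require('../assets/whisper/encoder.pte'),
+  require('../assets/whisper/decoder.pte'),
+  require('../assets/whisper/tokenizer.bin')
+);
+```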
+
+## Loading the model
+
+To load the model, use the `load` method. The required argument is `modelName`, which serves as an identifier for which model to use. It also accepts optional arguments such as `encoderSource`, `decoderSource` and `tokenizerSource`, which are strings that specify the location of the binaries for the model. For more information, take a look at the [loading models](../fundamentals/loading-models.md) page. This method returns a promise, which can resolve to an error or void.
+
+## Running the model
+
+To run the model, you can use the `transcribe` method. It accepts one argument, which is an array of numbers representing a waveform at a 16kHz sampling rate. The method returns a promise, which can resolve either to an error or a string containing the output text.
+
+## Obtaining the input
+
+To get the input, you can use the `loadAudio` method, which sets the internal input state of the model. Then you can just call `transcribe` without passing any arguments. It is also possible to pass inputs from other sources, as long as the input is a float array containing the aforementioned waveform.
diff --git a/docs/docs/hookless-api/_category_.json b/docs/docs/hookless-api/_category_.json
index e96f5186..6c0a8908 100644
--- a/docs/docs/hookless-api/_category_.json
+++ b/docs/docs/hookless-api/_category_.json
@@ -1,6 +1,6 @@
 {
   "label": "Hookless API",
-  "position": 4,
+  "position": 5,
   "link": {
     "type": "generated-index"
   }
diff --git a/docs/docs/module-api/_category_.json b/docs/docs/module-api/_category_.json
index b0400018..8cc82679 100644
--- a/docs/docs/module-api/_category_.json
+++ b/docs/docs/module-api/_category_.json
@@ -1,6 +1,6 @@
 {
   "label": "Module API",
-  "position": 5,
+  "position": 6,
   "link": {
     "type": "generated-index"
   }
diff --git a/docs/docs/speech-to-text/_category_.json b/docs/docs/speech-to-text/_category_.json
new file mode 100644
index 00000000..554e3476
--- /dev/null
+++ b/docs/docs/speech-to-text/_category_.json
@@ -0,0 +1,7 @@
+{
+  "label": "Speech To Text",
+  "position": 3,
+  "link": {
+    "type": "generated-index"
+  }
+}
diff --git a/docs/docs/speech-to-text/useSpeechToText.md b/docs/docs/speech-to-text/useSpeechToText.md
new file mode 100644
index 00000000..6cde2e04
--- /dev/null
+++ b/docs/docs/speech-to-text/useSpeechToText.md
@@ -0,0 +1,125 @@
+---
+title: useSpeechToText
+sidebar_position: 1
+---
+
+With the latest `v0.3.0` release we introduce a new hook - `useSpeechToText`. Speech to text is a task that allows you to transform spoken language into written text. It is commonly used to implement features such as transcription or voice assistants. As of now, [all supported STT models](#supported-models) run on the XNNPACK backend.
+
+:::info
+Currently, we do not support direct microphone input streaming to the model. Instead, in v0.3.0, we provide a way to transcribe an audio file.
+:::
+
+:::caution
+It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/software-mansion/react-native-executorch-moonshine-tiny).
You can also use [constants](https://github.com/software-mansion/react-native-executorch/tree/main/src/constants/modelUrls.ts) shipped with our library +::: + +## Reference + +```typescript +import { useSpeechToText } from 'react-native-executorch'; + +const { transcribe, error, loadAudio } = useSpeechToText({ + modelName: 'moonshine', +}); + +const audioUrl = ...; // URL with audio to transcribe + +await loadAudio(audioUrl); +const transcription = await transcribe(); +if (error) { + console.log(error); +} else { + console.log(transcription); +} +``` + +### Streaming + +Given that STT models can process audio no longer than 30 seconds, there is a need to chunk the input audio. Chunking audio may result in cutting speech mid-sentence, which might be hard to understand for the model. To make it work, we employed an algorithm (adapted for mobile devices from [whisper-streaming](https://aclanthology.org/2023.ijcnlp-demo.3.pdf)) that uses overlapping audio chunks. This might introduce some overhead, but allows for processing audio inputs of arbitrary length. + +### Arguments + +**`modelName`** +A literal of `"moonshine" | "whisper"` which serves as an identifier for which model should be used. + +**`encoderSource?`** +A string that specifies the location of a .pte file for the encoder. For further information on passing model sources, check out [Loading Models](https://docs.swmansion.com/react-native-executorch/docs/fundamentals/loading-models). Defaults to [constants](https://github.com/software-mansion/react-native-executorch/blob/main/src/constants/modelUrls.ts) for given model. + +**`decoderSource?`** +Analogous to the encoderSource, this takes in a string which is a source for the decoder part of the model. Defaults to [constants](https://github.com/software-mansion/react-native-executorch/blob/main/src/constants/modelUrls.ts) for given model. + +**`tokenizerSource?`** +A string that specifies the location to the tokenizer for the model. This works just as the encoder and decoder do. Defaults to [constants](https://github.com/software-mansion/react-native-executorch/blob/main/src/constants/modelUrls.ts) for given model. + +**`overlapSeconds?`** +Specifies the length of overlap between consecutive audio chunks (expressed in seconds). + +**`windowSize?`** +Specifies the size of each audio chunk (expressed in seconds). + +### Returns + +| Field | Type | Description | +| ------------------ | --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `transcribe` | `(input?: number[]) => Promise` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. | +| `loadAudio` | `(url: string) => void` | Loads audio file from given url. It sets an internal state which serves as an input to `transcribe()`. | +| `error` | Error | undefined | Contains the error message if the model failed to load. | +| `sequence` | string | This property is updated with each generated token. If you're looking to obtain tokens as they're generated, you should use this property. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. 
| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. |
| `downloadProgress` | `number` | Tracks the progress of the model download process. |
+
+## Running the model
+
+Before running the model's `transcribe` method, be sure to obtain the waveform of the audio you wish to transcribe. You can either use the `loadAudio` method to load audio from a URL and save it in the model's internal state, or obtain the waveform on your own (remember to use a sampling rate of 16kHz!). In the latter case, just pass the obtained waveform as an argument to the `transcribe` method, which returns a promise resolving to the transcription when successful. If the model fails during inference, the `error` property contains details of the error. If you want to obtain tokens in a streaming fashion, you can also use the `sequence` property, which is updated with each generated token, similar to the [useLLM](../llms/useLLM.md) hook.
+
+## Example
+
+```typescript
+import { Button, Text } from 'react-native';
+import { useSpeechToText } from 'react-native-executorch';
+
+function App() {
+  const { loadAudio, transcribe, sequence, error } = useSpeechToText({
+    modelName: 'whisper',
+  });
+
+  const audioUrl = ...; // URL with audio to transcribe
+
+  return (
+