opensearch-project
diff --git a/‎CHANGELOG.md‎
Lines changed: 1 addition & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎common/constants.ts‎
Lines changed: 57 additions & 0 deletions b/‎common/constants.ts‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎common/utils.ts‎
Lines changed: 1 addition & 0 deletions b/‎common/utils.ts‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎documentation/models.md‎
Lines changed: 117 additions & 0 deletions b/‎documentation/models.md‎
Lines changed: 117 additions & 0 deletions
diff --git a/‎documentation/tutorial.md‎
Lines changed: 43 additions & 0 deletions b/‎documentation/tutorial.md‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎package.json‎
Lines changed: 2 additions & 1 deletion b/‎package.json‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎public/pages/workflow_detail/component_input/component_input.tsx‎
Lines changed: 1 addition & 1 deletion b/‎public/pages/workflow_detail/component_input/component_input.tsx‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎public/pages/workflow_detail/component_input/ingest_inputs/advanced_settings.tsx‎
Lines changed: 3 additions & 2 deletions b/‎public/pages/workflow_detail/component_input/ingest_inputs/advanced_settings.tsx‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎public/pages/workflow_detail/component_input/ingest_inputs/ingest_data.tsx‎
Lines changed: 3 additions & 1 deletion b/‎public/pages/workflow_detail/component_input/ingest_inputs/ingest_data.tsx‎
Lines changed: 3 additions & 1 deletion
@@ -6,6 +6,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.1.0/)
 ## [Unreleased 3.0](https://github.com/opensearch-project/anomaly-detection/compare/2.x...HEAD)
 ### Features
 ### Enhancements
+#### Added workflow preset for Semantic Search using Sparse Encoders (https://github.com/opensearch-project/dashboards-flow-framework/pull/742)
 ### Bug Fixes
 ### Infrastructure
 ### Documentation
 
@@ -155,6 +155,38 @@ export const OPENAI_CONFIGS = {
   } as RemoteEmbeddingModelConfig,
 };
 
+// Neural Sparse
+// Known neural sparse encoding models, keyed by model name. Every entry uses
+// `passage_embedding` as its field name; `dimension` is 30522 for all models
+// except the multilingual one (105879).
+// NOTE(review): the dimension values look like tokenizer vocabulary sizes - confirm.
+export const NEURAL_SPARSE_CONFIGS = {
+  [`opensearch-neural-sparse-encoding-v2-distill`]: {
+    dimension: 30522,
+    fieldName: 'passage_embedding',
+  } as RemoteEmbeddingModelConfig,
+  [`opensearch-neural-sparse-encoding-v1`]: {
+    dimension: 30522,
+    fieldName: 'passage_embedding',
+  } as RemoteEmbeddingModelConfig,
+  [`opensearch-neural-sparse-encoding-multilingual-v1`]: {
+    dimension: 105879,
+    fieldName: 'passage_embedding',
+  } as RemoteEmbeddingModelConfig,
+  [`opensearch-neural-sparse-encoding-doc-v2-mini`]: {
+    dimension: 30522,
+    fieldName: 'passage_embedding',
+  } as RemoteEmbeddingModelConfig,
+  [`opensearch-neural-sparse-encoding-doc-v3-distill`]: {
+    dimension: 30522,
+    fieldName: 'passage_embedding',
+  } as RemoteEmbeddingModelConfig,
+  [`opensearch-neural-sparse-encoding-doc-v1`]: {
+    dimension: 30522,
+    fieldName: 'passage_embedding',
+  } as RemoteEmbeddingModelConfig,
+  [`opensearch-neural-sparse-encoding-doc-v2-distill`]: {
+    dimension: 30522,
+    fieldName: 'passage_embedding',
+  } as RemoteEmbeddingModelConfig,
+};
+
+
 /**
  * Various constants pertaining to Workflow configs
  */
@@ -173,6 +205,7 @@ export enum WORKFLOW_TYPE {
   HYBRID_SEARCH = 'Hybrid Search',
   VECTOR_SEARCH_WITH_RAG = 'RAG with Vector Retrieval',
   HYBRID_SEARCH_WITH_RAG = 'RAG with Hybrid Search',
+  SEMANTIC_SEARCH_USING_SPARSE_ENCODERS = 'Semantic Search using Sparse Encoders',
   CUSTOM = 'Custom Search',
   UNKNOWN = 'Unknown',
 }
@@ -211,6 +244,7 @@ export enum MODEL_TYPE {
 export enum MODEL_CATEGORY {
   EMBEDDING = 'EMBEDDING',
   LLM = 'LLM',
+  SPARSE_ENCODER = 'SPARSE_ENCODER',
 }
 
 /**
@@ -293,6 +327,13 @@ export const COHERE_EMBEDDING_MODEL_DOCS_LINK =
 export const BEDROCK_TITAN_EMBEDDING_DOCS_LINK =
   'https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md#amazon-bedrock-titan-text-embedding';
 
+// Sparse Encoder Models Documentation Links
+export const OPENSEARCH_NEURAL_SPARSE_DOCS_LINK =
+  'https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v2-distill';
+
+export const SAGEMAKER_SPARSE_DEPLOY_LINK =
+  'https://github.com/zhichao-aws/opensearch-neural-sparse-sample/tree/main/examples/deploy_on_sagemaker';
+
 // ML Models setup Documentation Link
 export const ML_MODELS_SETUP_DOCS_LINK =
   'https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md';
@@ -595,6 +636,18 @@ export const HYBRID_SEARCH_QUERY_MATCH_TERM = {
     },
   },
 };
+export const NEURAL_SPARSE_SEARCH_QUERY = { // query template built around a `neural_sparse` clause
+  _source: {
+    excludes: [VECTOR_FIELD_PATTERN], // keep the queried field out of the returned _source
+  },
+  query: {
+    neural_sparse: {
+      [VECTOR_FIELD_PATTERN]: { // the same VECTOR_FIELD_PATTERN constant is the queried field key
+        query_tokens: VECTOR_PATTERN, // NOTE(review): presumably replaced with the model output at runtime - confirm
+      },
+    },
+  },
+};
 
 export const QUERY_PRESETS = [
   {
@@ -649,6 +702,10 @@ export const QUERY_PRESETS = [
     name: WORKFLOW_TYPE.MULTIMODAL_SEARCH,
     query: customStringify(MULTIMODAL_SEARCH_QUERY_BOOL),
   },
+  {
+    name: 'Neural Sparse Search Query',
+    query: customStringify(NEURAL_SPARSE_SEARCH_QUERY),
+  },
   {
     name: 'Semantic search (neural query)',
     query: customStringify(SEMANTIC_SEARCH_QUERY_NEURAL),
 
@@ -53,6 +53,7 @@ export function isVectorSearchUseCase(workflowType?: WORKFLOW_TYPE): boolean {
       WORKFLOW_TYPE.HYBRID_SEARCH,
       WORKFLOW_TYPE.VECTOR_SEARCH_WITH_RAG,
       WORKFLOW_TYPE.HYBRID_SEARCH_WITH_RAG,
+      WORKFLOW_TYPE.SEMANTIC_SEARCH_USING_SPARSE_ENCODERS,
     ].includes(workflowType)
   );
 }
 
@@ -473,6 +473,123 @@ POST /_plugins/_ml/models/_register
 }
 ```
 
+### Neural Sparse Encoding
+
+Deploy a sparse encoding model from the Hugging Face Model Hub to a SageMaker real-time inference endpoint using this [guide](https://github.com/zhichao-aws/opensearch-neural-sparse-sample/tree/main/examples/deploy_on_sagemaker).
+
+Connector:
+
+```
+POST /_plugins/_ml/connectors/_create
+{
+  "name": "Neural Sparse Encoding",
+  "description": "Test connector for Sagemaker model",
+  "version": 1,
+  "protocol": "aws_sigv4",
+  "credential": {
+        "access_key": "",
+        "secret_key": "",
+        "session_token": ""
+    },
+  "parameters": {
+    "region": "us-east-1",
+    "service_name": "sagemaker",
+    "model": "opensearch-neural-sparse-encoding-v2-distill"
+  },
+  "actions": [
+    {
+        "action_type": "predict",
+        "method": "POST",
+        "headers": {
+            "content-type": "application/json"
+        },
+        "url": "https://runtime.sagemaker.us-east-1.amazonaws.com/endpoints/xxxx/invocations",
+        "request_body": "[\"${parameters.text_doc}\"]",
+        "post_process_function": "String escape(def input) { if (input instanceof String) { if (input.contains('\\\\')) { input = input.replace('\\\\', '\\\\\\\\'); } if (input.contains('\"')) { input = input.replace('\"', '\\\\\"'); } if (input.contains('\r')) { input = input.replace('\r', '\\\\r'); } if (input.contains('\t')) { input = input.replace('\t', '\\\\t'); } if (input.contains('\n')) { input = input.replace('\n', '\\\\n'); } if (input.contains('\b')) { input = input.replace('\b', '\\\\b'); } if (input.contains('\f')) { input = input.replace('\f', '\\\\f'); } return input; } return input.toString(); } if (params.result == null || params.result.length == 0) { return '{\"dataAsMap\":{\"error\":\"no response error\"}}'; } String response = params.result[0].toString(); response = response.substring(1, response.length() - 1).replace('=', '\":').replace(', ', ',\"'); return '{\"dataAsMap\":{\"response\":{\"' + response + '}}}';"
+        }
+    ]
+}
+```
+
+Model:
+
+```
+POST /_plugins/_ml/models/_register
+{
+  "name": "Neural Sparse Encoding Model",
+  "function_name": "remote",
+  "version": "1.0.0",
+  "connector_id": "<connector-id>",
+  "description": "Test connector for Sagemaker model",
+  "interface": {
+    "input": {
+        "type": "object",
+        "properties": {
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "text_doc": {
+                        "type": "string"
+                    }
+                },
+                "additionalProperties": true,
+                "required": [
+                    "text_doc"
+                ]
+            }
+        }
+    },
+    "output": {
+        "type": "object",
+        "properties": {
+            "inference_results": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "output": {
+                            "type": "array",
+                            "items": {
+                                "type": "object",
+                                "properties": {
+                                    "dataAsMap": {
+                                        "type": "object",
+                                        "properties": {
+                                            "response": {
+                                                "type": "object",
+                                                "additionalProperties": {
+                                                    "type": "number"
+                                                }
+                                            }
+                                        },
+                                        "required": [
+                                            "response"
+                                        ]
+                                    }
+                                },
+                                "required": [
+                                    "dataAsMap"
+                                ]
+                            }
+                        },
+                        "status_code": {
+                            "type": "integer"
+                        }
+                    },
+                    "required": [
+                        "output",
+                        "status_code"
+                    ]
+                }
+            }
+        },
+        "required": [
+            "inference_results"
+        ]
+    }
+  }
+}
+```
+
 ## Generative models
 
 ### Claude 3 Sonnet (hosted on Amazon Bedrock)
 
@@ -491,3 +491,46 @@ Override the query to a knn query, including the embedding output. For example:
     }
 }
 ```
+
+---
+
+## 9. Neural sparse search
+
+### ML resources
+Create and deploy a [Neural Sparse Encoding model](https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md#neural-sparse-encoding).
+
+### Index
+
+Ensure the index mappings have a `rank_features` field - something like the following:
+
+```
+"<embedding_field_name>": {
+    "type": "rank_features"
+}
+```
+
+### Ingest pipeline
+
+Single ML inference processor. Map your input text to the `text_doc` model input field. Optionally map the output `response` to a new document field. If needed, transform the response using a JSONPath expression.
+
+
+### Search pipeline
+
+Single ML inference **search request** processor. Map the query field containing the input text to the `text_doc` model input field. Optionally map the output `response` to a new field. If needed, transform the response using a JSONPath expression. Override the query to a neural sparse query. For example:
+
+```
+{
+    "_source": {
+        "excludes": [
+            "<embedding_field>"
+        ]
+    },
+    "query": {
+        "neural_sparse": {
+            "<embedding_field>": {
+                "query_tokens": ${response}
+            }
+        }
+    }
+}
+```
@@ -31,7 +31,8 @@
     "formik": "2.4.2",
     "jsonpath": "^1.1.1",
     "reactflow": "^11.8.3",
-    "yup": "^1.3.2"
+    "yup": "^1.3.2",
+    "react-markdown": "^4.3.1"
   },
   "devDependencies": {},
   "resolutions": {}
 
@@ -319,7 +319,7 @@ export function ComponentInput(props: ComponentInputProps) {
                 </EuiFlexItem>
               </EuiFlexGroup>
             ) : props.selectedComponentId === COMPONENT_ID.INGEST_DATA ? (
-              <IngestData disabled={props.readonly} />
+              <IngestData disabled={props.readonly} workflowType={props.workflow?.ui_metadata?.type} />
             ) : props.selectedComponentId === COMPONENT_ID.SEARCH_REQUEST ? (
               <ConfigureSearchRequest disabled={props.readonly} />
             ) : props.selectedComponentId === COMPONENT_ID.RUN_QUERY ? (
 
@@ -14,7 +14,7 @@ import {
 } from '@elastic/eui';
 import { JsonField } from '../input_fields';
 import { getIn, useFormikContext } from 'formik';
-import { WorkflowFormValues } from '../../../../../common';
+import { WorkflowFormValues, WORKFLOW_TYPE } from '../../../../../common';
 import { AppState } from '../../../../store';
 import {
   getEmbeddingField,
@@ -28,6 +28,7 @@ import {
 
 interface AdvancedSettingsProps {
   setHasInvalidDimensions: (hasInvalidDimensions: boolean) => void;
+  workflowType: WORKFLOW_TYPE | undefined;
   disabled: boolean;
 }
 
@@ -65,7 +66,7 @@ export function AdvancedSettings(props: AdvancedSettingsProps) {
 
           // If a dimension is found, it is a known embedding model.
           // Ensure the index is configured to be knn-enabled.
-          if (dimension !== undefined) {
+          if (dimension !== undefined && props.workflowType !== WORKFLOW_TYPE.SEMANTIC_SEARCH_USING_SPARSE_ENCODERS) {
             if (!isKnnIndex(curSettings)) {
               setFieldValue(
                 indexSettingsPath,
 
@@ -13,10 +13,11 @@ import {
 } from '@elastic/eui';
 import { TextField } from '../input_fields';
 import { AdvancedSettings } from './advanced_settings';
-import { KNN_VECTOR_DOCS_LINK } from '../../../../../common';
+import { KNN_VECTOR_DOCS_LINK, WORKFLOW_TYPE } from '../../../../../common';
 
 interface IngestDataProps {
   disabled: boolean;
+  workflowType: WORKFLOW_TYPE | undefined;
 }
 
 /**
@@ -57,6 +58,7 @@ export function IngestData(props: IngestDataProps) {
       <EuiFlexItem>
         <AdvancedSettings
           setHasInvalidDimensions={setHasInvalidDimensions}
+          workflowType={props.workflowType} 
           disabled={props.disabled}
         />
       </EuiFlexItem>
Original file line number	Diff line number	Diff line change
`@@ -53,6 +53,7 @@ export function isVectorSearchUseCase(workflowType?: WORKFLOW_TYPE): boolean {`
`53`	`53`	`WORKFLOW_TYPE.HYBRID_SEARCH,`
`54`	`54`	`WORKFLOW_TYPE.VECTOR_SEARCH_WITH_RAG,`
`55`	`55`	`WORKFLOW_TYPE.HYBRID_SEARCH_WITH_RAG,`
	`56`	`+ WORKFLOW_TYPE.SEMANTIC_SEARCH_USING_SPARSE_ENCODERS,`
`56`	`57`	`].includes(workflowType)`
`57`	`58`	`);`
`58`	`59`	`}`