Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.1.0/)
## [Unreleased 3.0](https://github.com/opensearch-project/anomaly-detection/compare/2.x...HEAD)
### Features
### Enhancements
- Added workflow preset for Semantic Search using Sparse Encoders (https://github.com/opensearch-project/dashboards-flow-framework/pull/742)
### Bug Fixes
### Infrastructure
### Documentation
Expand Down
58 changes: 58 additions & 0 deletions common/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,38 @@ export const OPENAI_CONFIGS = {
} as RemoteEmbeddingModelConfig,
};

// Neural Sparse
// Default configurations for the OpenSearch-provided neural sparse encoder
// models, keyed by model name. `fieldName` is the default index field that
// holds the sparse embedding; `dimension` is 30522 for most models and 105879
// for the multilingual model (presumably the encoder vocabulary sizes —
// confirm against the Hugging Face model cards).
export const NEURAL_SPARSE_CONFIGS = {
  'opensearch-neural-sparse-encoding-v2-distill': { dimension: 30522, fieldName: 'passage_embedding' } as RemoteEmbeddingModelConfig,
  'opensearch-neural-sparse-encoding-v1': { dimension: 30522, fieldName: 'passage_embedding' } as RemoteEmbeddingModelConfig,
  'opensearch-neural-sparse-encoding-multilingual-v1': { dimension: 105879, fieldName: 'passage_embedding' } as RemoteEmbeddingModelConfig,
  'opensearch-neural-sparse-encoding-doc-v2-mini': { dimension: 30522, fieldName: 'passage_embedding' } as RemoteEmbeddingModelConfig,
  'opensearch-neural-sparse-encoding-doc-v3-distill': { dimension: 30522, fieldName: 'passage_embedding' } as RemoteEmbeddingModelConfig,
  'opensearch-neural-sparse-encoding-doc-v1': { dimension: 30522, fieldName: 'passage_embedding' } as RemoteEmbeddingModelConfig,
  'opensearch-neural-sparse-encoding-doc-v2-distill': { dimension: 30522, fieldName: 'passage_embedding' } as RemoteEmbeddingModelConfig,
};

/**
* Various constants pertaining to Workflow configs
*/
Expand All @@ -173,6 +205,7 @@ export enum WORKFLOW_TYPE {
HYBRID_SEARCH = 'Hybrid Search',
VECTOR_SEARCH_WITH_RAG = 'RAG with Vector Retrieval',
HYBRID_SEARCH_WITH_RAG = 'RAG with Hybrid Search',
SEMANTIC_SEARCH_USING_SPARSE_ENCODERS = 'Semantic Search using Sparse Encoders',
CUSTOM = 'Custom Search',
UNKNOWN = 'Unknown',
}
Expand Down Expand Up @@ -211,6 +244,7 @@ export enum MODEL_TYPE {
export enum MODEL_CATEGORY {
EMBEDDING = 'EMBEDDING',
LLM = 'LLM',
SPARSE_ENCODER = 'SPARSE_ENCODER',
}

/**
Expand Down Expand Up @@ -293,6 +327,14 @@ export const COHERE_EMBEDDING_MODEL_DOCS_LINK =
export const BEDROCK_TITAN_EMBEDDING_DOCS_LINK =
'https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md#amazon-bedrock-titan-text-embedding';

// Sparse Encoder Models Documentation Links
export const OPENSEARCH_NEURAL_SPARSE_DOCS_LINK =
'https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v2-distill';

// TODO: Update this with the official OpenSearch documentation URL when it's available
export const SAGEMAKER_SPARSE_DEPLOY_LINK =
'https://github.com/zhichao-aws/opensearch-neural-sparse-sample/tree/main/examples/deploy_on_sagemaker';

// ML Models setup Documentation Link
export const ML_MODELS_SETUP_DOCS_LINK =
'https://github.com/opensearch-project/dashboards-flow-framework/blob/main/documentation/models.md';
Expand Down Expand Up @@ -595,6 +637,18 @@ export const HYBRID_SEARCH_QUERY_MATCH_TERM = {
},
},
};
// Preset neural_sparse query template. VECTOR_FIELD_PATTERN and VECTOR_PATTERN
// are placeholder constants (declared earlier in this file) that stand in for
// the user's sparse-embedding field and query tokens; the embedding field is
// excluded from _source so raw sparse vectors are not returned in hits.
export const NEURAL_SPARSE_SEARCH_QUERY = {
  _source: { excludes: [VECTOR_FIELD_PATTERN] },
  query: {
    neural_sparse: {
      [VECTOR_FIELD_PATTERN]: { query_tokens: VECTOR_PATTERN },
    },
  },
};

export const QUERY_PRESETS = [
{
Expand Down Expand Up @@ -649,6 +703,10 @@ export const QUERY_PRESETS = [
name: WORKFLOW_TYPE.MULTIMODAL_SEARCH,
query: customStringify(MULTIMODAL_SEARCH_QUERY_BOOL),
},
{
name: 'Neural Sparse Search Query',
query: customStringify(NEURAL_SPARSE_SEARCH_QUERY),
},
{
name: 'Semantic search (neural query)',
query: customStringify(SEMANTIC_SEARCH_QUERY_NEURAL),
Expand Down
1 change: 1 addition & 0 deletions common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ export function isVectorSearchUseCase(workflowType?: WORKFLOW_TYPE): boolean {
WORKFLOW_TYPE.HYBRID_SEARCH,
WORKFLOW_TYPE.VECTOR_SEARCH_WITH_RAG,
WORKFLOW_TYPE.HYBRID_SEARCH_WITH_RAG,
WORKFLOW_TYPE.SEMANTIC_SEARCH_USING_SPARSE_ENCODERS,
].includes(workflowType)
);
}
Expand Down
117 changes: 117 additions & 0 deletions documentation/models.md
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,123 @@ POST /_plugins/_ml/models/_register
}
```

### Neural Sparse Encoding

Deploy a sparse encoding model from the Hugging Face Model Hub to a SageMaker real-time inference endpoint using this [guide](https://github.com/zhichao-aws/opensearch-neural-sparse-sample/tree/main/examples/deploy_on_sagemaker).

Connector:

```
POST /_plugins/_ml/connectors/_create
{
"name": "Neural Sparse Encoding",
  "description": "Connector for a neural sparse encoding model hosted on Amazon SageMaker",
"version": 1,
"protocol": "aws_sigv4",
"credential": {
"access_key": "",
"secret_key": "",
"session_token": ""
},
"parameters": {
"region": "us-east-1",
"service_name": "sagemaker",
"model": "opensearch-neural-sparse-encoding-v2-distill"
},
"actions": [
{
"action_type": "predict",
"method": "POST",
"headers": {
"content-type": "application/json"
},
"url": "https://runtime.sagemaker.us-east-1.amazonaws.com/endpoints/xxxx/invocations",
"request_body": "[\"${parameters.text_doc}\"]",
"post_process_function": "String escape(def input) { if (input instanceof String) { if (input.contains('\\\\')) { input = input.replace('\\\\', '\\\\\\\\'); } if (input.contains('\"')) { input = input.replace('\"', '\\\\\"'); } if (input.contains('\r')) { input = input.replace('\r', '\\\\r'); } if (input.contains('\t')) { input = input.replace('\t', '\\\\t'); } if (input.contains('\n')) { input = input.replace('\n', '\\\\n'); } if (input.contains('\b')) { input = input.replace('\b', '\\\\b'); } if (input.contains('\f')) { input = input.replace('\f', '\\\\f'); } return input; } return input.toString(); } if (params.result == null || params.result.length == 0) { return '{\"dataAsMap\":{\"error\":\"no response error\"}}'; } String response = params.result[0].toString(); response = response.substring(1, response.length() - 1).replace('=', '\":').replace(', ', ',\"'); return '{\"dataAsMap\":{\"response\":{\"' + response + '}}}';"
}
]
}
```

Model:

```
POST /_plugins/_ml/models/_register
{ "name": "Neural Sparse Encoding Model",
"function_name": "remote",
"version": "1.0.0",
"connector_id": "<connector-id>",
  "description": "Neural sparse encoding model served from an Amazon SageMaker endpoint",
"interface": {
"input": {
"type": "object",
"properties": {
"parameters": {
"type": "object",
"properties": {
"text_doc": {
"type": "string"
}
},
"additionalProperties": true,
"required": [
"text_doc"
]
}
}
},
"output": {
"type": "object",
"properties": {
"inference_results": {
"type": "array",
"items": {
"type": "object",
"properties": {
"output": {
"type": "array",
"items": {
"type": "object",
"properties": {
"dataAsMap": {
"type": "object",
"properties": {
"response": {
"type": "object",
"additionalProperties": {
"type": "number"
}
}
},
"required": [
"response"
]
}
},
"required": [
"dataAsMap"
]
}
},
"status_code": {
"type": "integer"
}
},
"required": [
"output",
"status_code"
]
}
}
},
"required": [
"inference_results"
]
}
}
}
```

## Generative models

### Claude 3 Sonnet (hosted on Amazon Bedrock)
Expand Down
Loading
Loading